In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import os
import time
import pyspark
from sklearn.metrics import roc_auc_score
In [2]:
ls
'AML Class code'/ grid_search_results_xgb.csv best_nn_model.keras Modelling.ipynb best_xgb_model.json my_model.h5 'Credit Risk Project.docx'* 'Older Modelling File.ipynb' DataPrepFull_Modelling-Approach2+Strat.ipynb Read.ipynb DataPrepFull_Modelling.ipynb sample_submission.csv* dev.csv/ test_data.csv* Exploration.ipynb train_data.csv* feature_importance_xgb_m1.xlsx train_labels.csv* feature_importance_xgb_m2.xlsx Untitled1.ipynb 'Final_DAta Prep Steps.ipynb' Untitled2.ipynb grid_search_results.csv Untitled.ipynb grid_search_results_nn.csv X_train.csv
In [3]:
df = pd.read_csv('dev.csv/part-00000-ee748d50-0c69-46e0-bdfd-03dac1fb4272-c000.csv')
In [4]:
# set options to see the dataframe entirely
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
In [5]:
df.head(5)
Out[5]:
| customer_ID | target | S_2 | P_2 | D_39 | B_1 | B_2 | R_1 | S_3 | D_41 | B_3 | D_42 | D_43 | D_44 | B_4 | D_45 | B_5 | R_2 | D_46 | D_47 | D_48 | D_49 | B_6 | B_7 | B_8 | D_50 | D_51 | B_9 | R_3 | D_52 | P_3 | B_10 | D_53 | S_5 | B_11 | S_6 | D_54 | R_4 | S_7 | B_12 | S_8 | D_55 | D_56 | B_13 | R_5 | D_58 | S_9 | B_14 | D_59 | D_60 | D_61 | B_15 | S_11 | D_62 | D_63 | D_64 | D_65 | B_16 | B_17 | B_18 | B_19 | D_66 | B_20 | D_68 | S_12 | R_6 | S_13 | B_21 | D_69 | B_22 | D_70 | D_71 | D_72 | S_15 | B_23 | D_73 | P_4 | D_74 | D_75 | D_76 | B_24 | R_7 | D_77 | B_25 | B_26 | D_78 | D_79 | R_8 | R_9 | S_16 | D_80 | R_10 | R_11 | B_27 | D_81 | D_82 | S_17 | R_12 | B_28 | R_13 | D_83 | R_14 | R_15 | D_84 | R_16 | B_29 | B_30 | S_18 | D_86 | D_87 | R_17 | R_18 | D_88 | B_31 | S_19 | R_19 | B_32 | S_20 | R_20 | R_21 | B_33 | D_89 | R_22 | R_23 | D_91 | D_92 | D_93 | D_94 | R_24 | R_25 | D_96 | S_22 | S_23 | S_24 | S_25 | S_26 | D_102 | D_103 | D_104 | D_105 | D_106 | D_107 | B_36 | B_37 | R_26 | R_27 | B_38 | D_108 | D_109 | D_110 | D_111 | B_39 | D_112 | B_40 | S_27 | D_113 | D_114 | D_115 | D_116 | D_117 | D_118 | D_119 | D_120 | D_121 | D_122 | D_123 | D_124 | D_125 | D_126 | D_127 | D_128 | D_129 | B_41 | B_42 | D_130 | D_131 | D_132 | D_133 | R_28 | D_134 | D_135 | D_136 | D_137 | D_138 | D_139 | D_140 | D_141 | D_142 | D_143 | D_144 | D_145 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a | 0 | 2017-03-09 | 0.938469 | 0.001733 | 0.008724 | 1.006838 | 0.009228 | 0.124035 | 0.008771 | 0.004709 | NaN | NaN | 0.000630 | 0.080986 | 0.708906 | 0.170600 | 0.006204 | 0.358587 | 0.525351 | 0.255736 | NaN | 0.063902 | 0.059416 | 0.006466 | 0.148698 | 1.335856 | 0.008207 | 0.001423 | 0.207334 | 0.736463 | 0.096219 | NaN | 0.023381 | 0.002768 | 0.008322 | 1.001519 | 0.008298 | 0.161345 | 0.148266 | 0.922998 | 0.354596 | 0.152025 | 0.118075 | 0.001882 | 0.158612 | 0.065728 | 0.018385 | 0.063646 | 0.199617 | 0.308233 | 0.016361 | 0.401619 | 0.091071 | CR | O | 0.007126 | 0.007665 | NaN | 0.652984 | 0.008520 | NaN | 0.004730 | 6.0 | 0.272008 | 0.008363 | 0.515222 | 0.002644 | 0.009013 | 0.004808 | 0.008342 | 0.119403 | 0.004802 | 0.108271 | 0.050882 | NaN | 0.007554 | 0.080422 | 0.069067 | NaN | 0.004327 | 0.007562 | NaN | 0.007729 | 0.000272 | 0.001576 | 0.004239 | 0.001434 | NaN | 0.002271 | 0.004061 | 0.007121 | 0.002456 | 0.002310 | 0.003532 | 0.506612 | 0.008033 | 1.009825 | 0.084683 | 0.003820 | 0.007043 | 0.000438 | 0.006452 | 0.000830 | 0.005055 | NaN | 0.0 | 0.005720 | 0.007084 | NaN | 0.000198 | 0.008907 | NaN | 1 | 0.002537 | 0.005177 | 0.006626 | 0.009705 | 0.007782 | 0.002450 | 1.001101 | 0.002665 | 0.007479 | 0.006893 | 1.503673 | 1.006133 | 0.003569 | 0.008871 | 0.003950 | 0.003647 | 0.004950 | 0.894090 | 0.135561 | 0.911191 | 0.974539 | 0.001243 | 0.766688 | 1.008691 | 1.004587 | 0.893734 | NaN | 0.670041 | 0.009968 | 0.004572 | NaN | 1.008949 | 2.0 | NaN | 0.004326 | NaN | NaN | NaN | 1.007336 | 0.210060 | 0.676922 | 0.007871 | 1.0 | 0.238250 | 0.0 | 4.0 | 0.232120 | 0.236266 | 0.0 | 0.702280 | 0.434345 | 0.003057 | 0.686516 | 0.008740 | 1.0 | 1.003319 | 1.007819 | 1.000080 | 0.006805 | NaN | 0.002052 | 0.005972 | NaN | 0.004345 | 0.001535 | NaN | NaN | NaN | NaN | NaN | 0.002427 | 0.003706 | 0.003818 | NaN | 0.000569 | 0.000610 | 0.002674 |
| 1 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a | 0 | 2017-04-07 | 0.936665 | 0.005775 | 0.004923 | 1.000653 | 0.006151 | 0.126750 | 0.000798 | 0.002714 | NaN | NaN | 0.002526 | 0.069419 | 0.712795 | 0.113239 | 0.006206 | 0.353630 | 0.521311 | 0.223329 | NaN | 0.065261 | 0.057744 | 0.001614 | 0.149723 | 1.339794 | 0.008373 | 0.001984 | 0.202778 | 0.720886 | 0.099804 | NaN | 0.030599 | 0.002749 | 0.002482 | 1.009033 | 0.005136 | 0.140951 | 0.143530 | 0.919414 | 0.326757 | 0.156201 | 0.118737 | 0.001610 | 0.148459 | 0.093935 | 0.013035 | 0.065501 | 0.151387 | 0.265026 | 0.017688 | 0.406326 | 0.086805 | CR | O | 0.002413 | 0.007148 | NaN | 0.647093 | 0.002238 | NaN | 0.003879 | 6.0 | 0.188970 | 0.004030 | 0.509048 | 0.004193 | 0.007842 | 0.001283 | 0.006524 | 0.140611 | 0.000094 | 0.101018 | 0.040469 | NaN | 0.004832 | 0.081413 | 0.074166 | NaN | 0.004203 | 0.005304 | NaN | 0.001864 | 0.000979 | 0.009896 | 0.007597 | 0.000509 | NaN | 0.009810 | 0.000127 | 0.005966 | 0.000395 | 0.001327 | 0.007773 | 0.500855 | 0.000760 | 1.009461 | 0.081843 | 0.000347 | 0.007789 | 0.004311 | 0.002332 | 0.009469 | 0.003753 | NaN | 0.0 | 0.007584 | 0.006677 | NaN | 0.001142 | 0.005907 | NaN | 1 | 0.008427 | 0.008979 | 0.001854 | 0.009924 | 0.005987 | 0.002247 | 1.006779 | 0.002508 | 0.006827 | 0.002837 | 1.503577 | 1.005791 | 0.000571 | 0.000391 | 0.008351 | 0.008850 | 0.003180 | 0.902135 | 0.136333 | 0.919876 | 0.975624 | 0.004561 | 0.786007 | 1.000084 | 1.004118 | 0.906841 | NaN | 0.668647 | 0.003921 | 0.004654 | NaN | 1.003205 | 2.0 | NaN | 0.008707 | NaN | NaN | NaN | 1.007653 | 0.184093 | 0.822281 | 0.003444 | 1.0 | 0.247217 | 0.0 | 4.0 | 0.243532 | 0.241885 | 0.0 | 0.707017 | 0.430501 | 0.001306 | 0.686414 | 0.000755 | 1.0 | 1.008394 | 1.004333 | 1.008344 | 0.004407 | NaN | 0.001034 | 0.004838 | NaN | 0.007495 | 0.004931 | NaN | NaN | NaN | NaN | NaN | 0.003954 | 0.003167 | 0.005032 | NaN | 0.009576 | 0.005492 | 0.009217 |
| 2 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a | 0 | 2017-05-28 | 0.954180 | 0.091505 | 0.021655 | 1.009672 | 0.006815 | 0.123977 | 0.007598 | 0.009423 | NaN | NaN | 0.007605 | 0.068839 | 0.720884 | 0.060492 | 0.003259 | 0.334650 | 0.524568 | 0.189424 | NaN | 0.066982 | 0.056647 | 0.005126 | 0.151955 | 1.337179 | 0.009355 | 0.007426 | 0.206629 | 0.738044 | 0.134073 | NaN | 0.048367 | 0.010077 | 0.000530 | 1.009184 | 0.006961 | 0.112229 | 0.137014 | 1.001977 | 0.304124 | 0.153795 | 0.114534 | 0.006328 | 0.139504 | 0.084757 | 0.056653 | 0.070607 | 0.305883 | 0.212165 | 0.063955 | 0.406768 | 0.094001 | CR | O | 0.001878 | 0.003636 | NaN | 0.645819 | 0.000408 | NaN | 0.004578 | 6.0 | 0.495308 | 0.006838 | 0.679257 | 0.001337 | 0.006025 | 0.009393 | 0.002615 | 0.075868 | 0.007152 | 0.103239 | 0.047454 | NaN | 0.006561 | 0.078891 | 0.076510 | NaN | 0.001782 | 0.001422 | NaN | 0.005419 | 0.006149 | 0.009629 | 0.003094 | 0.008295 | NaN | 0.009362 | 0.000954 | 0.005447 | 0.007345 | 0.007624 | 0.008811 | 0.504606 | 0.004056 | 1.004291 | 0.081954 | 0.002709 | 0.004093 | 0.007139 | 0.008358 | 0.002325 | 0.007381 | NaN | 0.0 | 0.005901 | 0.001185 | NaN | 0.008013 | 0.008882 | NaN | 1 | 0.007327 | 0.002016 | 0.008686 | 0.008446 | 0.007291 | 0.007794 | 1.001014 | 0.009634 | 0.009820 | 0.005080 | 1.503359 | 1.005801 | 0.007425 | 0.009234 | 0.002471 | 0.009769 | 0.005433 | 0.939654 | 0.134938 | 0.958699 | 0.974067 | 0.011736 | 0.806840 | 1.003014 | 1.009285 | 0.928719 | NaN | 0.670901 | 0.001264 | 0.019176 | NaN | 1.000754 | 2.0 | NaN | 0.004092 | NaN | NaN | NaN | 1.004312 | 0.154837 | 0.853498 | 0.003269 | 1.0 | 0.239867 | 0.0 | 4.0 | 0.240768 | 0.239710 | 0.0 | 0.704843 | 0.434409 | 0.003954 | 0.690101 | 0.009617 | 1.0 | 1.009307 | 1.007831 | 1.006878 | 0.003221 | NaN | 0.005681 | 0.005497 | NaN | 0.009227 | 0.009123 | NaN | NaN | NaN | NaN | NaN | 0.003269 | 0.007329 | 0.000427 | NaN | 0.003429 | 0.006986 | 0.002603 |
| 3 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a | 0 | 2017-06-13 | 0.960384 | 0.002455 | 0.013683 | 1.002700 | 0.001373 | 0.117169 | 0.000685 | 0.005531 | NaN | NaN | 0.006406 | 0.055630 | 0.723997 | 0.166782 | 0.009918 | 0.323271 | 0.530929 | 0.135586 | NaN | 0.083720 | 0.049253 | 0.001418 | 0.151219 | 1.339909 | 0.006782 | 0.003515 | 0.208214 | 0.741813 | 0.134437 | NaN | 0.030063 | 0.009667 | 0.000783 | 1.007456 | 0.008706 | 0.102838 | 0.129017 | 0.704016 | 0.275055 | 0.155772 | 0.120740 | 0.004980 | 0.138100 | 0.048382 | 0.012498 | 0.065926 | 0.273553 | 0.204300 | 0.022732 | 0.405175 | 0.094854 | CR | O | 0.005899 | 0.005896 | NaN | 0.654358 | 0.005897 | NaN | 0.005207 | 6.0 | 0.508670 | 0.008183 | 0.515282 | 0.008716 | 0.005271 | 0.004554 | 0.002052 | 0.150209 | 0.005364 | 0.206394 | 0.031705 | NaN | 0.009559 | 0.077490 | 0.071547 | NaN | 0.005595 | 0.006363 | NaN | 0.000646 | 0.009193 | 0.008568 | 0.003895 | 0.005153 | NaN | 0.004876 | 0.005665 | 0.001888 | 0.004961 | 0.000034 | 0.004652 | 0.508998 | 0.006969 | 1.004728 | 0.060634 | 0.009982 | 0.008817 | 0.008690 | 0.007364 | 0.005924 | 0.008802 | NaN | 0.0 | 0.002520 | 0.003324 | NaN | 0.009455 | 0.008348 | NaN | 1 | 0.007053 | 0.003909 | 0.002478 | 0.006614 | 0.009977 | 0.007686 | 1.002775 | 0.007791 | 0.000458 | 0.007320 | 1.503701 | 1.007036 | 0.000664 | 0.003200 | 0.008507 | 0.004858 | 0.000063 | 0.913205 | 0.140058 | 0.926341 | 0.975499 | 0.007571 | 0.808214 | 1.001517 | 1.004514 | 0.935383 | NaN | 0.672620 | 0.002729 | 0.011720 | NaN | 1.005338 | 2.0 | NaN | 0.009703 | NaN | NaN | NaN | 1.002538 | 0.153939 | 0.844667 | 0.000053 | 1.0 | 0.240910 | 0.0 | 4.0 | 0.239400 | 0.240727 | 0.0 | 0.711546 | 0.436903 | 0.005135 | 0.687779 | 0.004649 | 1.0 | 1.001671 | 1.003460 | 1.007573 | 0.007703 | NaN | 0.007108 | 0.008261 | NaN | 0.007206 | 0.002409 | NaN | NaN | NaN | NaN | NaN | 0.006117 | 0.004516 | 0.003200 | NaN | 0.008419 | 0.006527 | 0.009600 |
| 4 | 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a | 0 | 2017-07-16 | 0.947248 | 0.002483 | 0.015193 | 1.000727 | 0.007605 | 0.117325 | 0.004653 | 0.009312 | NaN | NaN | 0.007731 | 0.038862 | 0.720619 | 0.143630 | 0.006667 | 0.231009 | 0.529305 | NaN | NaN | 0.075900 | 0.048918 | 0.001199 | 0.154026 | 1.341735 | 0.000519 | 0.001362 | 0.205468 | 0.691986 | 0.121518 | NaN | 0.054221 | 0.009484 | 0.006698 | 1.003738 | 0.003846 | 0.094311 | 0.129539 | 0.917133 | 0.231110 | 0.154914 | 0.095178 | 0.001653 | 0.126443 | 0.039259 | 0.027897 | 0.063697 | 0.233103 | 0.175655 | 0.031171 | 0.487460 | 0.093915 | CR | O | 0.009479 | 0.001714 | NaN | 0.650112 | 0.007773 | NaN | 0.005851 | 6.0 | 0.216507 | 0.008605 | 0.507712 | 0.006821 | 0.000152 | 0.000104 | 0.001419 | 0.096441 | 0.007972 | 0.106020 | 0.032733 | NaN | 0.008156 | 0.076561 | 0.074432 | NaN | 0.004933 | 0.004831 | NaN | 0.001833 | 0.005738 | 0.003289 | 0.002608 | 0.007338 | NaN | 0.007447 | 0.004465 | 0.006111 | 0.002246 | 0.002109 | 0.001141 | 0.506213 | 0.001770 | 1.000904 | 0.062492 | 0.005860 | 0.001845 | 0.007816 | 0.002470 | 0.005516 | 0.007166 | NaN | 0.0 | 0.000155 | 0.001504 | NaN | 0.002019 | 0.002678 | NaN | 1 | 0.007728 | 0.003432 | 0.002199 | 0.005511 | 0.004105 | 0.009656 | 1.006536 | 0.005158 | 0.003341 | 0.000264 | 1.509905 | 1.002915 | 0.003079 | 0.003845 | 0.007190 | 0.002983 | 0.000535 | 0.921026 | 0.131620 | 0.933479 | 0.978027 | 0.018200 | 0.822281 | 1.006125 | 1.005735 | 0.953363 | NaN | 0.673869 | 0.009998 | 0.017598 | NaN | 1.003175 | 2.0 | NaN | 0.009120 | NaN | NaN | NaN | 1.000130 | 0.120717 | 0.811199 | 0.008724 | 1.0 | 0.247939 | 0.0 | 4.0 | 0.244199 | 0.242325 | 0.0 | 0.705343 | 0.437433 | 0.002849 | 0.688774 | 0.000097 | 1.0 | 1.009886 | 1.005053 | 1.008132 | 0.009823 | NaN | 0.009680 | 0.004848 | NaN | 0.006312 | 0.004462 | NaN | NaN | NaN | NaN | NaN | 0.003671 | 0.004946 | 0.008889 | NaN | 0.001670 | 0.008126 | 0.009827 |
In [6]:
df.shape
Out[6]:
(1103628, 191)
In [7]:
df = df.sample(frac=0.7, random_state=42)
In [8]:
df.shape
Out[8]:
(772540, 191)
In [9]:
df['customer_ID'].nunique()
Out[9]:
91149
In [10]:
# Re Organize the dataframe.
fixed_columns = ['customer_ID', 'S_2', 'target'] # since s_2 is the date column
other_columns = sorted([col for col in df.columns if col not in fixed_columns])
new_column_order = fixed_columns + other_columns
df = df[new_column_order]
In [11]:
from collections import Counter
initial_chars = [col[0] for col in df.columns if col[0].isalpha()]
initial_char_counts = Counter(initial_chars)
initial_char_counts_df = pd.DataFrame(list(initial_char_counts.items()), columns=['Initial_Char', 'Count'])
mapping = {
'c': 'Customer ID',
's': 'Spend Variables',
't': 'Target',
'B': 'Balance Variables',
'D': 'Delinquency Variables',
'P': 'Payment Variables',
'R': 'Risk Variables',
}
initial_char_counts_df['Description'] = initial_char_counts_df['Initial_Char'].map(lambda x: mapping.get(x, 'Other'))
initial_char_counts_df
Out[11]:
| Initial_Char | Count | Description | |
|---|---|---|---|
| 0 | c | 1 | Customer ID |
| 1 | S | 22 | Other |
| 2 | t | 1 | Target |
| 3 | B | 40 | Balance Variables |
| 4 | D | 96 | Delinquency Variables |
| 5 | P | 3 | Payment Variables |
| 6 | R | 28 | Risk Variables |
In [12]:
import plotly.graph_objects as go
# Creating the plot
fig = go.Figure()
fig.add_trace(go.Bar(x=initial_char_counts_df['Description'],
y=initial_char_counts_df['Count'],
marker_color='rgb(55, 83, 109)'))
fig.update_layout(title='Count of Columns by Description',
xaxis=dict(title='Description'),
yaxis=dict(title='Count'),
height=600)
fig.show()
Customer History Buckets¶
In [13]:
customer_counts = df['customer_ID'].value_counts()
counts_df = pd.DataFrame(customer_counts).reset_index()
counts_df.columns = ['c_id', 'counts']
# grouping by counts to calculate number of customers
count_summary = counts_df.groupby('counts')['c_id'].nunique().reset_index()
# renaming the columns
count_summary.columns = ['count_bucket', 'num_customers']
total_customers = count_summary['num_customers'].sum()
# Calculate the percentage of total for num_customers
count_summary['percent_of_total'] = (count_summary['num_customers'] / total_customers) * 100
# Show the DataFrame
count_summary
Out[13]:
| count_bucket | num_customers | percent_of_total | |
|---|---|---|---|
| 0 | 1 | 1574 | 1.726843 |
| 1 | 2 | 1602 | 1.757562 |
| 2 | 3 | 1551 | 1.701609 |
| 3 | 4 | 1746 | 1.915545 |
| 4 | 5 | 2694 | 2.955600 |
| 5 | 6 | 5040 | 5.529408 |
| 6 | 7 | 9596 | 10.527817 |
| 7 | 8 | 15315 | 16.802159 |
| 8 | 9 | 19019 | 20.865835 |
| 9 | 10 | 17294 | 18.973329 |
| 10 | 11 | 10770 | 11.815818 |
| 11 | 12 | 4182 | 4.588092 |
| 12 | 13 | 766 | 0.840382 |
Dropping Customers with less than 3 months of data¶
In [14]:
customer_counts = df['customer_ID'].value_counts()
# Filter customer_counts to get IDs of customers who appear 3 times or more
customers_to_keep = customer_counts[customer_counts >= 3].index
# Filter the original DataFrame to keep only rows with these customer IDs
filtered_df = df[df['customer_ID'].isin(customers_to_keep)]
customer_counts = filtered_df['customer_ID'].value_counts()
counts_df = pd.DataFrame(customer_counts).reset_index()
counts_df.columns = ['c_id', 'counts']
# grouping by counts to calculate number of customers
count_summary = counts_df.groupby('counts')['c_id'].nunique().reset_index()
# renaming the columns
count_summary.columns = ['count_bucket', 'num_customers']
# show dataframe
count_summary
Out[14]:
| count_bucket | num_customers | |
|---|---|---|
| 0 | 3 | 1551 |
| 1 | 4 | 1746 |
| 2 | 5 | 2694 |
| 3 | 6 | 5040 |
| 4 | 7 | 9596 |
| 5 | 8 | 15315 |
| 6 | 9 | 19019 |
| 7 | 10 | 17294 |
| 8 | 11 | 10770 |
| 9 | 12 | 4182 |
| 10 | 13 | 766 |
In [15]:
df = filtered_df
Convert S_2 to datetime¶
In [16]:
df['S_2'] = pd.to_datetime(df['S_2'])
print(df['S_2'].dtype)
datetime64[ns]
Missing Value Analysis¶
In [17]:
# See if there any null values
v1 = df.isnull().sum()
v2 = ((df.isnull().sum()/len(df))*100).round(5)
[f'{percent:.2f}%' for percent in v2]
null_df = pd.DataFrame({'Null Value Count':v1,
'Null Value Percent':[f'{percent:.2f}%' for percent in v2]})
null_df.sort_values(by='Null Value Count', ascending=False)
Out[17]:
| Null Value Count | Null Value Percent | |
|---|---|---|
| D_87 | 767174 | 99.92% |
| D_88 | 766977 | 99.90% |
| D_108 | 763651 | 99.46% |
| D_110 | 763449 | 99.44% |
| D_111 | 763449 | 99.44% |
| B_39 | 763137 | 99.40% |
| D_73 | 759973 | 98.99% |
| B_42 | 757425 | 98.65% |
| D_136 | 739740 | 96.35% |
| D_138 | 739740 | 96.35% |
| D_135 | 739740 | 96.35% |
| D_134 | 739740 | 96.35% |
| D_137 | 739740 | 96.35% |
| R_9 | 724561 | 94.37% |
| B_29 | 715278 | 93.16% |
| D_106 | 692648 | 90.22% |
| D_132 | 692482 | 90.19% |
| D_49 | 692046 | 90.14% |
| R_26 | 682101 | 88.84% |
| D_76 | 681881 | 88.81% |
| D_66 | 681104 | 88.71% |
| D_42 | 660687 | 86.05% |
| D_142 | 637449 | 83.03% |
| D_53 | 567437 | 73.91% |
| D_82 | 564019 | 73.46% |
| B_17 | 435034 | 56.66% |
| D_50 | 434760 | 56.63% |
| D_105 | 420609 | 54.78% |
| D_56 | 414084 | 53.93% |
| S_9 | 407525 | 53.08% |
| D_77 | 348458 | 45.39% |
| D_43 | 228077 | 29.71% |
| S_27 | 193536 | 25.21% |
| D_46 | 165914 | 21.61% |
| S_3 | 141795 | 18.47% |
| S_7 | 141795 | 18.47% |
| D_62 | 104847 | 13.66% |
| D_48 | 99703 | 12.99% |
| D_61 | 82682 | 10.77% |
| P_3 | 38415 | 5.00% |
| D_44 | 37758 | 4.92% |
| D_78 | 37758 | 4.92% |
| D_64 | 27859 | 3.63% |
| D_68 | 27737 | 3.61% |
| D_55 | 24939 | 3.25% |
| D_69 | 24586 | 3.20% |
| D_83 | 24586 | 3.20% |
| D_116 | 22082 | 2.88% |
| D_125 | 22082 | 2.88% |
| D_124 | 22082 | 2.88% |
| D_123 | 22082 | 2.88% |
| D_122 | 22082 | 2.88% |
| D_121 | 22082 | 2.88% |
| D_120 | 22082 | 2.88% |
| D_119 | 22082 | 2.88% |
| D_118 | 22082 | 2.88% |
| D_117 | 22082 | 2.88% |
| D_115 | 22082 | 2.88% |
| D_114 | 22082 | 2.88% |
| D_113 | 22082 | 2.88% |
| D_91 | 18905 | 2.46% |
| D_126 | 16140 | 2.10% |
| R_27 | 14530 | 1.89% |
| D_130 | 13288 | 1.73% |
| D_143 | 13288 | 1.73% |
| D_145 | 13288 | 1.73% |
| D_104 | 13288 | 1.73% |
| D_103 | 13288 | 1.73% |
| D_141 | 13288 | 1.73% |
| D_131 | 13288 | 1.73% |
| D_139 | 13288 | 1.73% |
| D_129 | 13288 | 1.73% |
| D_128 | 13288 | 1.73% |
| D_107 | 13288 | 1.73% |
| D_59 | 13107 | 1.71% |
| D_70 | 11896 | 1.55% |
| D_79 | 9786 | 1.27% |
| P_2 | 6123 | 0.80% |
| B_13 | 6089 | 0.79% |
| D_133 | 5858 | 0.76% |
| D_144 | 5575 | 0.73% |
| D_102 | 5569 | 0.73% |
| D_140 | 5566 | 0.72% |
| D_52 | 3875 | 0.50% |
| D_89 | 3875 | 0.50% |
| D_84 | 3875 | 0.50% |
| D_81 | 3271 | 0.43% |
| D_72 | 3000 | 0.39% |
| D_74 | 2764 | 0.36% |
| D_80 | 2764 | 0.36% |
| S_22 | 2588 | 0.34% |
| S_24 | 2520 | 0.33% |
| B_8 | 2509 | 0.33% |
| S_25 | 1935 | 0.25% |
| B_25 | 997 | 0.13% |
| B_15 | 997 | 0.13% |
| D_112 | 363 | 0.05% |
| B_20 | 269 | 0.04% |
| B_19 | 269 | 0.04% |
| B_2 | 269 | 0.04% |
| B_16 | 269 | 0.04% |
| D_45 | 269 | 0.04% |
| D_54 | 269 | 0.04% |
| B_22 | 269 | 0.04% |
| B_38 | 269 | 0.04% |
| B_26 | 269 | 0.04% |
| B_27 | 269 | 0.04% |
| D_41 | 269 | 0.04% |
| B_3 | 269 | 0.04% |
| B_30 | 269 | 0.04% |
| B_33 | 269 | 0.04% |
| D_109 | 197 | 0.03% |
| B_41 | 102 | 0.01% |
| S_26 | 94 | 0.01% |
| S_23 | 62 | 0.01% |
| R_20 | 11 | 0.00% |
| B_37 | 8 | 0.00% |
| R_12 | 8 | 0.00% |
| B_40 | 6 | 0.00% |
| B_6 | 6 | 0.00% |
| B_11 | 0 | 0.00% |
| R_4 | 0 | 0.00% |
| S_12 | 0 | 0.00% |
| R_21 | 0 | 0.00% |
| R_22 | 0 | 0.00% |
| R_23 | 0 | 0.00% |
| S_13 | 0 | 0.00% |
| R_24 | 0 | 0.00% |
| R_25 | 0 | 0.00% |
| B_12 | 0 | 0.00% |
| S_8 | 0 | 0.00% |
| target | 0 | 0.00% |
| S_15 | 0 | 0.00% |
| R_28 | 0 | 0.00% |
| R_3 | 0 | 0.00% |
| R_5 | 0 | 0.00% |
| R_8 | 0 | 0.00% |
| S_6 | 0 | 0.00% |
| S_11 | 0 | 0.00% |
| S_5 | 0 | 0.00% |
| R_6 | 0 | 0.00% |
| R_7 | 0 | 0.00% |
| B_1 | 0 | 0.00% |
| B_10 | 0 | 0.00% |
| R_19 | 0 | 0.00% |
| S_20 | 0 | 0.00% |
| S_19 | 0 | 0.00% |
| S_18 | 0 | 0.00% |
| S_17 | 0 | 0.00% |
| S_16 | 0 | 0.00% |
| R_2 | 0 | 0.00% |
| B_5 | 0 | 0.00% |
| R_18 | 0 | 0.00% |
| D_39 | 0 | 0.00% |
| D_58 | 0 | 0.00% |
| B_21 | 0 | 0.00% |
| D_51 | 0 | 0.00% |
| B_23 | 0 | 0.00% |
| B_24 | 0 | 0.00% |
| S_2 | 0 | 0.00% |
| D_47 | 0 | 0.00% |
| B_28 | 0 | 0.00% |
| D_63 | 0 | 0.00% |
| B_31 | 0 | 0.00% |
| B_32 | 0 | 0.00% |
| B_36 | 0 | 0.00% |
| D_127 | 0 | 0.00% |
| B_4 | 0 | 0.00% |
| B_9 | 0 | 0.00% |
| B_7 | 0 | 0.00% |
| D_60 | 0 | 0.00% |
| D_65 | 0 | 0.00% |
| R_17 | 0 | 0.00% |
| P_4 | 0 | 0.00% |
| R_16 | 0 | 0.00% |
| R_15 | 0 | 0.00% |
| R_14 | 0 | 0.00% |
| R_13 | 0 | 0.00% |
| R_11 | 0 | 0.00% |
| R_10 | 0 | 0.00% |
| R_1 | 0 | 0.00% |
| D_96 | 0 | 0.00% |
| D_71 | 0 | 0.00% |
| D_94 | 0 | 0.00% |
| D_93 | 0 | 0.00% |
| D_92 | 0 | 0.00% |
| B_14 | 0 | 0.00% |
| D_86 | 0 | 0.00% |
| B_18 | 0 | 0.00% |
| D_75 | 0 | 0.00% |
| customer_ID | 0 | 0.00% |
In [18]:
### Dropping Columns with more than 20% nulls
# Dropping Columns with more than 20% Nulls
# calculate % if missing
# v2 = df.isnull().mean()*100
# # filter out columns with more than 20% nulls
# cols_to_drop = v2[v2>20].index
# # droppping these columns from the dataframe
# df.drop(columns=cols_to_drop, inplace=True)
# # recalculate nulls
# v1 = df.isnull().sum()
# v2 = ((df.isnull().sum()/len(df))*100).round(5)
# [f'{percent:.2f}%' for percent in v2]
# null_df2 = pd.DataFrame({'Null Value Count':v1,
# 'Null Value Percent':[f'{percent:.2f}%' for percent in v2]})
# null_df2.sort_values(by='Null Value Count', ascending=False)
Recheck for Missing Values¶
In [20]:
# # See if there any null values
# v1 = df.isnull().sum()
# v2 = ((df.isnull().sum()/len(df))*100).round(5)
# [f'{percent:.2f}%' for percent in v2]
# null_df = pd.DataFrame({'Null Value Count':v1,
# 'Null Value Percent':[f'{percent:.2f}%' for percent in v2]})
# null_df.sort_values(by='Null Value Count', ascending=False)
In [ ]:
In [ ]:
Imputing Categorical Columns with Mode and Numerical with Mean¶
In [21]:
categorical_columns = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117',
'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
for column in categorical_columns:
df[column] = df[column].fillna(df[column].mode()[0])
numerical_columns = df.select_dtypes(include=[np.number]).columns
for column in numerical_columns:
df[column] = df[column].fillna(df[column].median())
In [24]:
for col in categorical_columns:
print(df[col].value_counts())
print('\n')
B_30 0.0 653606 1.0 106573 2.0 7583 Name: count, dtype: int64 B_38 2.0 272273 3.0 174347 1.0 159901 5.0 62089 4.0 41485 7.0 35239 6.0 22428 Name: count, dtype: int64 D_114 1.0 482721 0.0 285041 Name: count, dtype: int64 D_116 0.0 766895 1.0 867 Name: count, dtype: int64 D_117 -1.0 225881 3.0 161239 4.0 158614 2.0 93755 5.0 63263 6.0 48314 1.0 16696 Name: count, dtype: int64 D_120 0.0 680865 1.0 86897 Name: count, dtype: int64 D_126 1.0 608319 0.0 122394 -1.0 37049 Name: count, dtype: int64 D_63 CO 572746 CR 128750 CL 60596 XZ 3388 XM 1287 XL 995 Name: count, dtype: int64 D_64 O 431797 U 213302 R 117473 -1 5190 Name: count, dtype: int64 D_66 1.0 766935 0.0 827 Name: count, dtype: int64 D_68 6.0 413529 5.0 168788 4.0 67194 3.0 66739 2.0 30538 1.0 18811 0.0 2163 Name: count, dtype: int64
Categorical Data Encoding¶
In [25]:
columns_to_encode = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_68']
cat_df_encoded = pd.get_dummies(df, columns=columns_to_encode, prefix=columns_to_encode, dtype=int)
In [26]:
new_columns = [col for col in cat_df_encoded.columns if col not in df.columns]
print(new_columns)
['B_30_0.0', 'B_30_1.0', 'B_30_2.0', 'B_38_1.0', 'B_38_2.0', 'B_38_3.0', 'B_38_4.0', 'B_38_5.0', 'B_38_6.0', 'B_38_7.0', 'D_114_0.0', 'D_114_1.0', 'D_116_0.0', 'D_116_1.0', 'D_117_-1.0', 'D_117_1.0', 'D_117_2.0', 'D_117_3.0', 'D_117_4.0', 'D_117_5.0', 'D_117_6.0', 'D_120_0.0', 'D_120_1.0', 'D_126_-1.0', 'D_126_0.0', 'D_126_1.0', 'D_63_CL', 'D_63_CO', 'D_63_CR', 'D_63_XL', 'D_63_XM', 'D_63_XZ', 'D_64_-1', 'D_64_O', 'D_64_R', 'D_64_U', 'D_68_0.0', 'D_68_1.0', 'D_68_2.0', 'D_68_3.0', 'D_68_4.0', 'D_68_5.0', 'D_68_6.0']
In [27]:
df2 = cat_df_encoded
In [28]:
df2.head()
Out[28]:
| customer_ID | S_2 | target | B_1 | B_10 | B_11 | B_12 | B_13 | B_14 | B_15 | B_16 | B_17 | B_18 | B_19 | B_2 | B_20 | B_21 | B_22 | B_23 | B_24 | B_25 | B_26 | B_27 | B_28 | B_29 | B_3 | B_31 | B_32 | B_33 | B_36 | B_37 | B_39 | B_4 | B_40 | B_41 | B_42 | B_5 | B_6 | B_7 | B_8 | B_9 | D_102 | D_103 | D_104 | D_105 | D_106 | D_107 | D_108 | D_109 | D_110 | D_111 | D_112 | D_113 | D_115 | D_118 | D_119 | D_121 | D_122 | D_123 | D_124 | D_125 | D_127 | D_128 | D_129 | D_130 | D_131 | D_132 | D_133 | D_134 | D_135 | D_136 | D_137 | D_138 | D_139 | D_140 | D_141 | D_142 | D_143 | D_144 | D_145 | D_39 | D_41 | D_42 | D_43 | D_44 | D_45 | D_46 | D_47 | D_48 | D_49 | D_50 | D_51 | D_52 | D_53 | D_54 | D_55 | D_56 | D_58 | D_59 | D_60 | D_61 | D_62 | D_65 | D_66 | D_69 | D_70 | D_71 | D_72 | D_73 | D_74 | D_75 | D_76 | D_77 | D_78 | D_79 | D_80 | D_81 | D_82 | D_83 | D_84 | D_86 | D_87 | D_88 | D_89 | D_91 | D_92 | D_93 | D_94 | D_96 | P_2 | P_3 | P_4 | R_1 | R_10 | R_11 | R_12 | R_13 | R_14 | R_15 | R_16 | R_17 | R_18 | R_19 | R_2 | R_20 | R_21 | R_22 | R_23 | R_24 | R_25 | R_26 | R_27 | R_28 | R_3 | R_4 | R_5 | R_6 | R_7 | R_8 | R_9 | S_11 | S_12 | S_13 | S_15 | S_16 | S_17 | S_18 | S_19 | S_20 | S_22 | S_23 | S_24 | S_25 | S_26 | S_27 | S_3 | S_5 | S_6 | S_7 | S_8 | S_9 | B_30_0.0 | B_30_1.0 | B_30_2.0 | B_38_1.0 | B_38_2.0 | B_38_3.0 | B_38_4.0 | B_38_5.0 | B_38_6.0 | B_38_7.0 | D_114_0.0 | D_114_1.0 | D_116_0.0 | D_116_1.0 | D_117_-1.0 | D_117_1.0 | D_117_2.0 | D_117_3.0 | D_117_4.0 | D_117_5.0 | D_117_6.0 | D_120_0.0 | D_120_1.0 | D_126_-1.0 | D_126_0.0 | D_126_1.0 | D_63_CL | D_63_CO | D_63_CR | D_63_XL | D_63_XM | D_63_XZ | D_64_-1 | D_64_O | D_64_R | D_64_U | D_68_0.0 | D_68_1.0 | D_68_2.0 | D_68_3.0 | D_68_4.0 | D_68_5.0 | D_68_6.0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1001678 | e82426907e83eff6314d6932387e05980260b6a504a859378e40f6148de77e9b | 2018-01-30 | 0 | 0.092478 | 0.024581 | 0.063496 | 0.006179 | 0.023176 | 0.022227 | 0.009837 | 0.758565 | 0.932517 | 0.287218 | 0.121374 | 0.222187 | 0.118112 | 0.007755 | 0.006110 | 0.273957 | 0.004579 | 0.039733 | 0.000247 | 0.002718 | 0.102830 | 0.005099 | 0.065400 | 1 | 0.009412 | 0.001723 | 0.000011 | 0.088204 | 0.14597 | 0.278010 | 0.310617 | 0.002795 | 0.022975 | 0.012549 | 0.021231 | 0.320102 | 1.000424 | 0.516240 | 0.424792 | 1.008174 | 0.960303 | 0.497643 | 0.136416 | 0.674171 | 0.005392 | 0.001766 | 0.891639 | 1.003717 | 1.007348 | 0.208180 | 0.037980 | 0.042318 | 0.033467 | 0.082271 | 0.294749 | 0.009710 | 0.006331 | 0.001693 | 0.009782 | 0.009530 | 0.004320 | 0.006810 | 0.000972 | 0.16179 | 0.008678 | 0.211991 | 0.005103 | 0.253976 | 0.005023 | 0.006987 | 0.002402 | 0.004553 | 0.002616 | 0.378180 | 0.004751 | 0.002337 | 0.007682 | 0.685093 | 0.006109 | 0.12183 | 0.116663 | 0.504001 | 0.047147 | 0.538970 | 0.080949 | 0.815861 | 0.13003 | 0.109446 | 0.008428 | 0.045970 | 0.013349 | 1.008537 | 0.548011 | 0.149113 | 0.505297 | 0.468230 | 0.092569 | 0.831179 | 0.022596 | 0.008995 | 1.0 | 0.005642 | 0.251570 | 0.013525 | 0.337655 | 0.102926 | 0.288860 | 0.334192 | 0.05855 | 0.205999 | 0.007066 | 0.006606 | 0.004961 | 0.005136 | 0.009484 | 0.005704 | 0.004489 | 0.004797 | 1.0 | 0.081778 | 0.006549 | 0.004516 | 0.001358 | 0.009264 | 0.001387 | 0.000945 | 0.423450 | 0.696847 | 0.009930 | 0.006237 | 0.004581 | 0.008294 | 1.000849 | 0.002636 | 0.001108 | 0.001155 | 0.000026 | 0.002033 | 0.004208 | 0.005400 | 0.007020 | 0.004571 | 0.001855 | 0.003639 | 0.003954 | 0.008042 | 0.001306 | 0.03644 | 1.009312 | 0.002486 | 0.201494 | 0.007820 | 0.000448 | 0.005406 | 0.007668 | 0.001542 | 0.172798 | 0.446841 | 0.190555 | 0.005894 | 0.801286 | 0.004133 | 0.003098 | 0.006537 | 0.002140 | 0.004594 | 0.949931 | 0.140650 | 0.963861 | 0.971872 | 0.009365 | 0.004759 | 0.415000 | 0.029318 | 0.001722 | 0.345205 | 0.108934 | 0.019381 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 698107 | a1ece17ade53583a522f0fec8c7e8c79085a24c9377bba1ff4fdce232b4da214 | 2017-08-30 | 1 | 0.069503 | 0.101049 | 0.070815 | 0.626662 | 0.365552 | 0.785919 | 0.922997 | 1.004107 | 0.915636 | 0.134597 | 0.086449 | 0.300279 | 1.007248 | 0.002313 | 0.503756 | 0.063787 | 0.958598 | 0.007392 | 0.009718 | 0.001148 | 0.397575 | 0.001723 | 0.430102 | 1 | 0.002739 | 0.008025 | 0.003982 | 0.063742 | 0.14597 | 0.174456 | 0.055480 | 0.007250 | 0.022975 | 0.001694 | 0.069223 | 0.063528 | 1.004163 | 0.007677 | 0.122184 | 1.004207 | 0.950343 | 0.071197 | 0.136416 | 0.339161 | 0.005392 | 0.001980 | 0.891639 | 1.003717 | 0.006020 | 0.200886 | 0.119490 | 0.114635 | 0.113965 | 0.399811 | 0.292976 | 0.008145 | 0.368086 | 0.002370 | 0.009576 | 0.004705 | 0.001926 | 0.007697 | 0.006051 | 0.16179 | 0.007495 | 0.211991 | 0.005103 | 0.253976 | 0.005023 | 0.006987 | 0.003982 | 0.009797 | 0.002361 | 0.378180 | 0.005402 | 0.004501 | 0.006795 | 0.800477 | 0.641640 | 0.12183 | 0.096170 | 0.125717 | 0.106280 | 0.481777 | 0.573031 | 0.046104 | 0.13003 | 0.109446 | 0.002313 | 0.481761 | 0.010404 | 1.003439 | 0.148661 | 0.149113 | 0.082128 | 0.316978 | 0.924212 | 0.007875 | 0.037178 | 0.000487 | 1.0 | 0.005549 | 0.253756 | 0.075213 | 0.009911 | 0.102926 | 0.078010 | 0.135256 | 0.05855 | 0.044626 | 0.509549 | 0.008299 | 0.001804 | 0.005779 | 0.505277 | 0.007095 | 0.501574 | 0.009669 | 1.0 | 0.081778 | 0.009226 | 0.009876 | 0.007885 | 0.009767 | 0.007391 | 0.008429 | 0.363639 | 0.269508 | 0.001070 | 0.001198 | 0.009938 | 0.008982 | 0.054798 | 0.003986 | 0.002093 | 0.006496 | 0.005303 | 0.007658 | 0.001471 | 0.002264 | 1.006851 | 0.000424 | 0.001323 | 0.001392 | 0.009447 | 0.001465 | 0.000372 | 0.03644 | 1.006273 | 0.005403 | 0.100891 | 0.006563 | 0.503488 | 0.010675 | 0.182436 | 1.007199 | 0.172798 | 0.648812 | 0.269872 | 1.000615 | 0.306887 | 0.006539 | 0.001750 | 1.001116 | 0.004646 | 0.002871 | 0.986852 | 0.881867 | 1.019099 | 0.891789 | 0.019339 | 0.320809 | 0.175189 | 0.704642 | 0.009834 | 0.473014 | 0.496838 | 0.020089 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 451750 | 690f87ac1353a1ef667fa941e321477102811b642e71bda7ee31f31e0a324575 | 2017-03-18 | 1 | 0.550235 | 0.014174 | 0.489797 | 0.007269 | 0.006078 | 0.106068 | 0.001858 | 1.000686 | 0.985567 | 0.198068 | 1.007074 | 0.028153 | 1.008544 | 0.004195 | 0.505404 | 0.595276 | 0.000755 | 0.178551 | 0.004988 | 0.009959 | 0.087017 | 0.005099 | 0.907538 | 1 | 0.004282 | 0.006677 | 0.009813 | 0.552285 | 0.14597 | 0.303336 | 0.282492 | 0.008354 | 0.022975 | 0.016795 | 0.009934 | 0.691952 | 1.007928 | 0.645657 | 0.009052 | 0.003526 | 0.009925 | 0.333975 | 0.136416 | 0.007719 | 0.005392 | 0.007642 | 0.891639 | 1.003717 | 1.009590 | 0.200134 | 0.028547 | 0.281556 | 0.284744 | 0.233213 | 0.148046 | 0.004905 | 0.092844 | 0.001623 | 0.002265 | 0.001137 | 0.008589 | 0.005200 | 0.009643 | 0.16179 | 0.009668 | 0.211991 | 0.005103 | 0.253976 | 0.005023 | 0.006987 | 0.005393 | 0.002461 | 0.006307 | 0.378180 | 0.002981 | 0.001905 | 0.000468 | 0.000191 | 0.001918 | 0.12183 | 0.192925 | 0.255509 | 0.099420 | 0.486834 | 0.334910 | 0.757523 | 0.13003 | 0.109446 | 0.009742 | 0.011181 | 0.013349 | 1.003327 | 0.551844 | 0.149113 | 0.398534 | 0.238379 | 0.539913 | 0.935858 | 0.007404 | 0.007285 | 1.0 | 0.000732 | 0.506233 | 0.010650 | 0.006161 | 0.102926 | 0.218794 | 0.276046 | 0.05855 | 0.205999 | 0.009848 | 0.005611 | 0.002026 | 0.003054 | 0.504428 | 0.009692 | 0.009721 | 0.006141 | 1.0 | 0.081778 | 0.001494 | 0.007583 | 0.009587 | 0.003169 | 0.007167 | 0.001000 | 0.472415 | 0.412725 | 0.001032 | 0.004999 | 0.005112 | 0.001965 | 1.005624 | 0.004845 | 0.000833 | 0.002345 | 0.003151 | 0.007910 | 0.005808 | 0.007583 | 0.007328 | 0.006428 | 0.006203 | 0.001365 | 0.000948 | 0.006021 | 0.003025 | 0.03644 | 1.005733 | 0.003526 | 0.008849 | 0.003744 | 0.000786 | 0.001697 | 0.006504 | 0.001919 | 0.172798 | 0.284922 | 0.188889 | 0.007357 | 0.506795 | 0.006982 | 0.004230 | 0.003931 | 0.002829 | 0.009914 | 0.970445 | 0.686452 | 0.960000 | 0.684947 | 0.001207 | 0.009092 | 0.369557 | 0.050117 | 0.007323 | 0.273249 | 0.000872 | 0.019381 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 336998 | 4e41c8de6ad791f70548dd1e3f5dd7641885180c240e57aa1122aab230ddf9e6 | 2017-08-28 | 0 | 0.001704 | 0.295186 | 0.009168 | 0.059050 | 0.015061 | 0.004249 | 0.003515 | 0.009922 | 0.932517 | 1.003641 | 0.002695 | 0.812935 | 0.005447 | 0.004822 | 0.004476 | 0.026788 | 0.002857 | 0.002045 | 0.004380 | 0.006927 | 0.065456 | 0.005099 | 0.009482 | 1 | 0.004631 | 1.001611 | 0.005051 | 0.007671 | 0.14597 | 0.059279 | 0.036768 | 0.005562 | 0.022975 | 0.034604 | 0.177883 | 0.033266 | 0.003200 | 0.001994 | 0.000987 | 0.006254 | 0.001904 | 0.333975 | 0.136416 | 0.004313 | 0.005392 | 0.009597 | 0.891639 | 1.003717 | 1.004763 | 0.208888 | 0.836221 | 0.818625 | 0.819371 | 0.732808 | 0.430107 | 0.000682 | 0.138326 | 0.002661 | 0.005841 | 1.004648 | 1.004891 | 0.008831 | 0.003620 | 0.16179 | 0.008020 | 0.211991 | 0.005103 | 0.253976 | 0.005023 | 0.006987 | 0.005000 | 0.001946 | 0.001037 | 0.378180 | 0.004152 | 0.006598 | 0.008179 | 0.007660 | 0.004461 | 0.12183 | 0.088647 | 0.001576 | 0.317622 | 0.423614 | 0.668637 | 0.108188 | 0.13003 | 0.290106 | 0.004070 | 0.228856 | 0.013349 | 1.001418 | 0.148305 | 0.115426 | 0.003264 | 0.170666 | 0.073078 | 0.107157 | 0.503322 | 0.003896 | 1.0 | 0.007260 | 0.007641 | 0.094936 | 0.007068 | 0.102926 | 0.003147 | 0.009368 | 0.05855 | 0.467864 | 0.004748 | 0.008386 | 0.001245 | 0.006583 | 0.504227 | 0.005105 | 0.009094 | 0.002025 | 1.0 | 0.081778 | 0.002630 | 0.002007 | 0.008523 | 0.003001 | 0.005839 | 0.008463 | 1.004331 | 0.623208 | 0.004379 | 0.008625 | 0.000019 | 0.001013 | 1.004802 | 0.000863 | 0.003678 | 0.005060 | 0.002457 | 0.005356 | 0.002545 | 0.005371 | 0.007462 | 0.008011 | 0.008939 | 0.006929 | 0.003677 | 0.005429 | 0.002318 | 0.03644 | 1.008289 | 0.000226 | 0.002526 | 0.005590 | 0.005532 | 0.005212 | 0.007322 | 0.003281 | 0.172798 | 0.285991 | 0.186971 | 0.004755 | 0.508953 | 0.001270 | 0.009659 | 0.008348 | 0.001551 | 0.002803 | 0.297968 | 0.139452 | 0.080747 | 0.974545 | 0.001003 | 0.287958 | 0.163977 | 0.008608 | 1.005079 | 0.139884 | 0.004941 | 0.019381 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 446688 | 67de95d0e4bb392f2384d45f5d1944269de0cb37e5f2b77ceb7107c6ef18c05a | 2017-10-09 | 1 | 0.277471 | 0.018577 | 0.231653 | 0.014207 | 0.025692 | 0.066600 | 0.008354 | 1.007376 | 1.001243 | 0.209912 | 0.368584 | 0.083958 | 0.948962 | 0.007272 | 0.009839 | 0.250800 | 0.005238 | 0.116631 | 0.001909 | 0.005859 | 0.087666 | 0.005099 | 0.361439 | 1 | 0.003556 | 0.003434 | 0.005899 | 0.278873 | 0.14597 | 0.196832 | 0.304891 | 0.000061 | 0.022975 | 0.004144 | 0.010874 | 0.296214 | 1.008416 | 0.482246 | 0.003631 | 0.002465 | 0.003741 | 0.333975 | 0.136416 | 0.007364 | 0.005392 | 0.006433 | 0.891639 | 1.003717 | 0.005043 | 0.203524 | 0.059302 | 0.061084 | 0.055094 | 0.145271 | 0.434661 | 0.001164 | 0.050673 | 0.008965 | 0.008984 | 1.003624 | 1.004067 | 0.003678 | 0.003218 | 0.16179 | 0.002409 | 0.211991 | 0.005103 | 0.253976 | 0.005023 | 0.006987 | 1.006120 | 0.002359 | 0.931348 | 0.373401 | 1.005839 | 0.239530 | 0.459985 | 0.324037 | 0.285931 | 0.12183 | 0.078766 | 0.009928 | 0.040742 | 0.450376 | 0.134951 | 0.509138 | 0.13003 | 0.067773 | 0.003925 | 0.087683 | 0.003649 | 1.006069 | 0.579490 | 0.104387 | 0.164118 | 0.420743 | 0.365058 | 0.611261 | 0.026196 | 0.004948 | 1.0 | 0.003961 | 0.007824 | 0.004159 | 0.003793 | 0.102926 | 0.074460 | 0.138262 | 0.05855 | 0.205999 | 0.006376 | 0.006612 | 0.009926 | 0.007995 | 0.504428 | 0.009803 | 0.006562 | 0.009056 | 1.0 | 0.081778 | 0.008971 | 0.009339 | 0.000474 | 0.003738 | 0.006572 | 0.003103 | 0.543514 | 0.897214 | 0.004518 | 0.004120 | 0.005425 | 0.006901 | 1.008179 | 0.003851 | 0.006478 | 0.004763 | 0.009795 | 0.001420 | 0.008240 | 0.000049 | 0.002438 | 0.001948 | 0.007413 | 0.004663 | 0.007063 | 0.004741 | 0.005064 | 0.03644 | 1.006411 | 0.007799 | 0.005209 | 0.005652 | 0.008214 | 0.009816 | 0.003596 | 0.003489 | 0.172798 | 0.329063 | 0.189788 | 0.286854 | 0.400545 | 0.005367 | 0.004743 | 0.000409 | 0.003395 | 0.004220 | 0.953209 | 0.131797 | 0.950913 | 0.974243 | 0.005657 | 0.009277 | 0.179880 | 0.035951 | 0.000421 | 0.265130 | 0.315731 | 0.019381 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
In [ ]:
In [ ]:
Feature Engineering¶
In [29]:
df2['S_2'] = pd.to_datetime(df2['S_2'])
end_date = df2['S_2'].max()
# Date ranges
last_3_months = end_date - pd.DateOffset(months=3)
last_6_months = end_date - pd.DateOffset(months=6)
last_9_months = end_date - pd.DateOffset(months=9)
last_12_months = end_date - pd.DateOffset(months=12)
# Exclusion categorical columns
exclusion_list = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68', 'S_2']
columns_to_include = [col for col in df2.columns if not col.startswith(tuple(exclusion_list))]
# New features
df2['S_Total'] = df2[columns_to_include].filter(like='S_').sum(axis=1)
df2['P_Total'] = df2[columns_to_include].filter(like='P_').sum(axis=1)
df2['B_Total'] = df2[columns_to_include].filter(like='B_').sum(axis=1)
df2['R_Total'] = df2[columns_to_include].filter(like='R_').sum(axis=1)
df2['D_Ave'] = df2[columns_to_include].filter(like='D_').mean(axis=1)
df2['S_Ave'] = df2[columns_to_include].filter(like='S_').mean(axis=1)
df2['P_Ave'] = df2[columns_to_include].filter(like='P_').mean(axis=1)
df2['B_Ave'] = df2[columns_to_include].filter(like='B_').mean(axis=1)
df2['R_Ave'] = df2[columns_to_include].filter(like='R_').mean(axis=1)
df2['D_Ave'] = df2[columns_to_include].filter(like='D_').mean(axis=1)
df2['S_Ave'] = df2[columns_to_include].filter(like='S_').mean(axis=1)
df2['P_Ave'] = df2[columns_to_include].filter(like='P_').mean(axis=1)
df2['B_Ave'] = df2[columns_to_include].filter(like='B_').mean(axis=1)
df2['R_Ave'] = df2[columns_to_include].filter(like='R_').mean(axis=1)
df2['S_Ave_3_months'] = df2[columns_to_include].filter(like='S_').loc[df2['S_2'] >= last_3_months].mean(axis=1)
df2['S_Ave_6_months'] = df2[columns_to_include].filter(like='S_').loc[df2['S_2'] >= last_6_months].mean(axis=1)
df2['S_Ave_9_months'] = df2[columns_to_include].filter(like='S_').loc[df2['S_2'] >= last_9_months].mean(axis=1)
df2['S_Ave_12_months'] = df2[columns_to_include].filter(like='S_').loc[df2['S_2'] >= last_12_months].mean(axis=1)
df2['P_Ave_3_months'] = df2[columns_to_include].filter(like='P_').loc[df2['S_2'] >= last_3_months].mean(axis=1)
df2['P_Ave_6_months'] = df2[columns_to_include].filter(like='P_').loc[df2['S_2'] >= last_6_months].mean(axis=1)
df2['P_Ave_9_months'] = df2[columns_to_include].filter(like='P_').loc[df2['S_2'] >= last_9_months].mean(axis=1)
df2['P_Ave_12_months'] = df2[columns_to_include].filter(like='P_').loc[df2['S_2'] >= last_12_months].mean(axis=1)
df2['R_Ave_3_months'] = df2[columns_to_include].filter(like='R_').loc[df2['S_2'] >= last_3_months].mean(axis=1)
df2['R_Ave_6_months'] = df2[columns_to_include].filter(like='R_').loc[df2['S_2'] >= last_6_months].mean(axis=1)
df2['R_Ave_9_months'] = df2[columns_to_include].filter(like='R_').loc[df2['S_2'] >= last_9_months].mean(axis=1)
df2['R_Ave_12_months'] = df2[columns_to_include].filter(like='R_').loc[df2['S_2'] >= last_12_months].mean(axis=1)
df2['B_Ave_3_months'] = df2[columns_to_include].filter(like='B_').loc[df2['S_2'] >= last_3_months].mean(axis=1)
df2['B_Ave_6_months'] = df2[columns_to_include].filter(like='B_').loc[df2['S_2'] >= last_6_months].mean(axis=1)
df2['B_Ave_9_months'] = df2[columns_to_include].filter(like='B_').loc[df2['S_2'] >= last_9_months].mean(axis=1)
df2['B_Ave_12_months'] = df2[columns_to_include].filter(like='B_').loc[df2['S_2'] >= last_12_months].mean(axis=1)
df2['D_Ave_3_months'] = df2[columns_to_include].filter(like='D_').loc[df2['S_2'] >= last_3_months].mean(axis=1)
df2['D_Ave_6_months'] = df2[columns_to_include].filter(like='D_').loc[df2['S_2'] >= last_6_months].mean(axis=1)
df2['D_Ave_9_months'] = df2[columns_to_include].filter(like='D_').loc[df2['S_2'] >= last_9_months].mean(axis=1)
df2['D_Ave_12_months'] = df2[columns_to_include].filter(like='D_').loc[df2['S_2'] >= last_12_months].mean(axis=1)
In [30]:
print(list(df2.columns))
['customer_ID', 'S_2', 'target', 'B_1', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_2', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_26', 'B_27', 'B_28', 'B_29', 'B_3', 'B_31', 'B_32', 'B_33', 'B_36', 'B_37', 'B_39', 'B_4', 'B_40', 'B_41', 'B_42', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'D_102', 'D_103', 'D_104', 'D_105', 'D_106', 'D_107', 'D_108', 'D_109', 'D_110', 'D_111', 'D_112', 'D_113', 'D_115', 'D_118', 'D_119', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_127', 'D_128', 'D_129', 'D_130', 'D_131', 'D_132', 'D_133', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143', 'D_144', 'D_145', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 'D_50', 'D_51', 'D_52', 'D_53', 'D_54', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_65', 'D_66', 'D_69', 'D_70', 'D_71', 'D_72', 'D_73', 'D_74', 'D_75', 'D_76', 'D_77', 'D_78', 'D_79', 'D_80', 'D_81', 'D_82', 'D_83', 'D_84', 'D_86', 'D_87', 'D_88', 'D_89', 'D_91', 'D_92', 'D_93', 'D_94', 'D_96', 'P_2', 'P_3', 'P_4', 'R_1', 'R_10', 'R_11', 'R_12', 'R_13', 'R_14', 'R_15', 'R_16', 'R_17', 'R_18', 'R_19', 'R_2', 'R_20', 'R_21', 'R_22', 'R_23', 'R_24', 'R_25', 'R_26', 'R_27', 'R_28', 'R_3', 'R_4', 'R_5', 'R_6', 'R_7', 'R_8', 'R_9', 'S_11', 'S_12', 'S_13', 'S_15', 'S_16', 'S_17', 'S_18', 'S_19', 'S_20', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27', 'S_3', 'S_5', 'S_6', 'S_7', 'S_8', 'S_9', 'B_30_0.0', 'B_30_1.0', 'B_30_2.0', 'B_38_1.0', 'B_38_2.0', 'B_38_3.0', 'B_38_4.0', 'B_38_5.0', 'B_38_6.0', 'B_38_7.0', 'D_114_0.0', 'D_114_1.0', 'D_116_0.0', 'D_116_1.0', 'D_117_-1.0', 'D_117_1.0', 'D_117_2.0', 'D_117_3.0', 'D_117_4.0', 'D_117_5.0', 'D_117_6.0', 'D_120_0.0', 'D_120_1.0', 'D_126_-1.0', 'D_126_0.0', 'D_126_1.0', 'D_63_CL', 'D_63_CO', 'D_63_CR', 'D_63_XL', 'D_63_XM', 'D_63_XZ', 'D_64_-1', 'D_64_O', 'D_64_R', 'D_64_U', 'D_68_0.0', 'D_68_1.0', 'D_68_2.0', 'D_68_3.0', 'D_68_4.0', 'D_68_5.0', 'D_68_6.0', 'S_Total', 'P_Total', 'B_Total', 'R_Total', 'D_Ave', 'S_Ave', 'P_Ave', 'B_Ave', 'R_Ave', 'S_Ave_3_months', 'S_Ave_6_months', 'S_Ave_9_months', 'S_Ave_12_months', 'P_Ave_3_months', 'P_Ave_6_months', 'P_Ave_9_months', 'P_Ave_12_months', 'R_Ave_3_months', 'R_Ave_6_months', 'R_Ave_9_months', 'R_Ave_12_months', 'B_Ave_3_months', 'B_Ave_6_months', 'B_Ave_9_months', 'B_Ave_12_months', 'D_Ave_3_months', 'D_Ave_6_months', 'D_Ave_9_months', 'D_Ave_12_months']
In [ ]:
Split the DataFrame¶
In [31]:
X = df2.drop(['customer_ID', 'target', 'S_2'], axis=1)
y = df2['target']
from sklearn.model_selection import train_test_split
# split 1
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
# split 2
X_test1, X_test2, y_test1, y_test2 = train_test_split(X_temp, y_temp, test_size=0.5, random_state=59)
In [32]:
print(X_train.shape)
print(y_train.shape)
print(X_test1.shape)
print(y_test1.shape)
print(X_test2.shape)
print(y_test2.shape)
(537433, 250) (537433,) (115164, 250) (115164,) (115165, 250) (115165,)
In [33]:
# X_train.to_csv('X_train.csv', index=False)
# y_train.to_csv('y_train.csv', index=False)
# X_test1.to_csv('X_test1.csv', index=False)
# y_test1.to_csv('y_test1.csv', index=False)
# X_test2.to_csv('X_test2.csv', index=False)
# y_test2.to_csv('y_test2.csv', index=False)
XGBoost Model 1¶
In [34]:
from xgboost import XGBClassifier
xgb_m1 = XGBClassifier(random_state=1, use_label_encoder=False, seed=69)
xgb_m1.fit(X_train, y_train)
Out[34]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=1, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=1, ...)In [36]:
# Get feature importances
feature_importances = xgb_m1.feature_importances_
# Print feature importance for features with importance > 0.5%
print("Features with importance higher than 0.5%:")
for feature_name, importance in zip(X_train.columns, feature_importances):
if importance > 0.005:
print(f'Feature: {feature_name}, Importance: {importance}')
Features with importance higher than 0.5%: Feature: B_1, Importance: 0.062093429267406464 Feature: B_10, Importance: 0.005027878098189831 Feature: B_2, Importance: 0.019205378368496895 Feature: B_3, Importance: 0.011666232720017433 Feature: B_4, Importance: 0.005044857505708933 Feature: B_7, Importance: 0.010539853014051914 Feature: B_9, Importance: 0.02490614727139473 Feature: D_112, Importance: 0.005575456190854311 Feature: D_129, Importance: 0.005559643264859915 Feature: D_41, Importance: 0.00764946173876524 Feature: D_42, Importance: 0.018204940482974052 Feature: D_43, Importance: 0.006187329534441233 Feature: D_44, Importance: 0.0065676490776240826 Feature: D_45, Importance: 0.008879962377250195 Feature: D_46, Importance: 0.007054412737488747 Feature: D_48, Importance: 0.011242859996855259 Feature: D_51, Importance: 0.010159661062061787 Feature: D_75, Importance: 0.006474910769611597 Feature: D_79, Importance: 0.007479172199964523 Feature: P_2, Importance: 0.22402159869670868 Feature: R_1, Importance: 0.01718958094716072 Feature: R_27, Importance: 0.01262607891112566 Feature: S_23, Importance: 0.007821113802492619 Feature: S_3, Importance: 0.011532943695783615 Feature: B_38_2.0, Importance: 0.012651875615119934 Feature: B_38_4.0, Importance: 0.006722653284668922 Feature: B_Total, Importance: 0.006749620195478201 Feature: R_Total, Importance: 0.013241227716207504 Feature: B_Ave, Importance: 0.007870757952332497 Feature: R_Ave, Importance: 0.05340268090367317 Feature: R_Ave_12_months, Importance: 0.005185040645301342 Feature: B_Ave_3_months, Importance: 0.005713469814509153 Feature: B_Ave_6_months, Importance: 0.006645355373620987
In [37]:
feature_importances = xgb_m1.feature_importances_
# Create a DataFrame for feature importances
feature_importance_df = pd.DataFrame({
'Feature': X_train.columns,
'Importance': feature_importances
})
# Filter the DataFrame for features with importance > 0.5%
important_features_df = feature_importance_df[feature_importance_df['Importance'] > 0.005]
# Sort the DataFrame based on the importance in descending order
important_features_df = important_features_df.sort_values(by='Importance', ascending=False)
# print("Features with importance higher than 0.5%:")
# print(important_features_df)
# Plotting
plt.figure(figsize=(10, 8)) # Adjust the figure size as necessary
sns.barplot(x='Importance', y='Feature', data=important_features_df.sort_values('Importance', ascending=False))
plt.title('Feature Importance (>0.5%)')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()
In [38]:
important_features_1 = []
for feature_name, importance in zip(X_train.columns, feature_importances):
if importance > 0.005: # Check if feature importance is greater than 0.5%
important_features_1.append(feature_name)
print("Important features:", important_features_1)
Important features: ['B_1', 'B_10', 'B_2', 'B_3', 'B_4', 'B_7', 'B_9', 'D_112', 'D_129', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_48', 'D_51', 'D_75', 'D_79', 'P_2', 'R_1', 'R_27', 'S_23', 'S_3', 'B_38_2.0', 'B_38_4.0', 'B_Total', 'R_Total', 'B_Ave', 'R_Ave', 'R_Ave_12_months', 'B_Ave_3_months', 'B_Ave_6_months']
XGBoost Model 2¶
In [39]:
xgb_m2 = XGBClassifier(
n_estimators = 300,
learning_rate = 0.5,
max_depth = 4,
subsample=0.5, # Use 50% of observations to build each tree
colsample_bytree=0.5, # Use 50% of features to build each tree
scale_pos_weight=5, # Assign a weight of 5 to default observations
eval_metric='logloss',
use_label_encoder=False,
seed=42
)
xgb_m2.fit(X_train, y_train)
Out[39]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=0.5, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='logloss',
feature_types=None, gamma=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.5, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None, n_estimators=300,
n_jobs=None, num_parallel_tree=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=0.5, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='logloss',
feature_types=None, gamma=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.5, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None, n_estimators=300,
n_jobs=None, num_parallel_tree=None, random_state=None, ...)In [40]:
# Get feature importances
feature_importances_2 = xgb_m2.feature_importances_
# Print feature importance for features with importance > 0.5%
print("Features with importance higher than 0.5%:")
for feature_name, importance in zip(X_train.columns, feature_importances_2):
if importance > 0.005:
print(f'Feature: {feature_name}, Importance: {importance}')
Features with importance higher than 0.5%: Feature: B_1, Importance: 0.009449629113078117 Feature: B_3, Importance: 0.007355320733040571 Feature: B_37, Importance: 0.12651139497756958 Feature: B_9, Importance: 0.02417602762579918 Feature: D_132, Importance: 0.006089573726058006 Feature: D_41, Importance: 0.0075592705979943275 Feature: D_42, Importance: 0.03347364068031311 Feature: D_43, Importance: 0.005484941881150007 Feature: D_45, Importance: 0.007913424633443356 Feature: D_48, Importance: 0.1390070617198944 Feature: D_49, Importance: 0.005143444053828716 Feature: D_51, Importance: 0.007368069142103195 Feature: D_52, Importance: 0.009602224454283714 Feature: D_75, Importance: 0.016339194029569626 Feature: P_2, Importance: 0.06553985178470612 Feature: R_26, Importance: 0.007541041821241379 Feature: S_3, Importance: 0.012038648128509521 Feature: S_7, Importance: 0.010565096512436867 Feature: P_Total, Importance: 0.011251780204474926 Feature: B_Ave, Importance: 0.006103881634771824 Feature: R_Ave, Importance: 0.025396572425961494 Feature: S_Ave_6_months, Importance: 0.005313452798873186
In [41]:
feature_importances_2 = xgb_m2.feature_importances_
# Create a DataFrame for feature importances
feature_importance_df_2 = pd.DataFrame({
'Feature': X_train.columns,
'Importance': feature_importances_2
})
# Filter the DataFrame for features with importance > 0.5%
important_features_df_2 = feature_importance_df_2[feature_importance_df['Importance'] > 0.005]
# Sort the DataFrame based on the importance in descending order
important_features_df_2 = important_features_df_2.sort_values(by='Importance', ascending=False)
# print("Features with importance higher than 0.5%:")
# print(important_features_df_2)
# Plotting
plt.figure(figsize=(10, 8)) # Adjust the figure size as necessary
sns.barplot(x='Importance', y='Feature', data=important_features_df_2.sort_values('Importance', ascending=False))
plt.title('Feature Importance (>0.5%)')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()
In [42]:
important_features_2 = []
for feature_name, importance in zip(X_train.columns, feature_importances_2):
if importance > 0.005: # Check if feature importance is greater than 0.5%
important_features_2.append(feature_name)
print("Important features:", important_features_2)
Important features: ['B_1', 'B_3', 'B_37', 'B_9', 'D_132', 'D_41', 'D_42', 'D_43', 'D_45', 'D_48', 'D_49', 'D_51', 'D_52', 'D_75', 'P_2', 'R_26', 'S_3', 'S_7', 'P_Total', 'B_Ave', 'R_Ave', 'S_Ave_6_months']
Consolidate Most Important Features from Both the Models¶
In [43]:
print(important_features_1)
print('\n')
print(important_features_2)
print('\n')
# get a consolidated list of all these features:
unique_elements = set(important_features_1).union(set(important_features_2))
features = list(unique_elements)
print(features)
['B_1', 'B_10', 'B_2', 'B_3', 'B_4', 'B_7', 'B_9', 'D_112', 'D_129', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_48', 'D_51', 'D_75', 'D_79', 'P_2', 'R_1', 'R_27', 'S_23', 'S_3', 'B_38_2.0', 'B_38_4.0', 'B_Total', 'R_Total', 'B_Ave', 'R_Ave', 'R_Ave_12_months', 'B_Ave_3_months', 'B_Ave_6_months'] ['B_1', 'B_3', 'B_37', 'B_9', 'D_132', 'D_41', 'D_42', 'D_43', 'D_45', 'D_48', 'D_49', 'D_51', 'D_52', 'D_75', 'P_2', 'R_26', 'S_3', 'S_7', 'P_Total', 'B_Ave', 'R_Ave', 'S_Ave_6_months'] ['D_52', 'B_Ave_3_months', 'D_112', 'B_38_2.0', 'B_2', 'R_Ave', 'R_Ave_12_months', 'S_7', 'R_1', 'B_38_4.0', 'R_26', 'B_7', 'D_41', 'P_2', 'D_129', 'D_132', 'B_3', 'D_49', 'D_45', 'D_44', 'B_1', 'D_46', 'B_37', 'P_Total', 'B_Total', 'D_51', 'S_Ave_6_months', 'D_75', 'R_27', 'B_9', 'B_Ave_6_months', 'D_79', 'D_48', 'S_23', 'D_43', 'B_4', 'B_10', 'B_Ave', 'R_Total', 'S_3', 'D_42']
In [ ]:
Setting Aside the Data For Strategy¶
In [47]:
columns_to_include = ["customer_ID", "target", "S_2"] + features
# Create df3 by selecting the specified columns from df2
df3 = df2[columns_to_include]
In [ ]:
Recreate Training and Testing Data to Include Only the above Features¶
In [48]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
X = df3.drop(['customer_ID','S_2','target'], axis=1)
y = df3['target']
X_train, X_test1, y_train, y_test1 = train_test_split(X, y, test_size=0.3, random_state=42)
X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test1, y_test1, test_size=0.5, random_state=42)
In [49]:
print(X_train.shape)
print(y_train.shape)
print(X_test1.shape)
print(y_test1.shape)
print(X_test2.shape)
print(y_test2.shape)
(537433, 41) (537433,) (115164, 41) (115164,) (115165, 41) (115165,)
XGBoost GridSearch¶
In [50]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
param_grid = {
'n_estimators': [50, 100, 300],
'learning_rate': [0.01, 0.1],
'subsample': [0.5, 0.8], # % of obs each tree
'colsample_bytree': [0.5, 1.0], # % of features each tree
'scale_pos_weight': [1, 5, 10] # Weight of default observations
}
xgb_clf = XGBClassifier(use_label_encoder=False, seed=4)
grid_search_xgb = GridSearchCV(estimator=xgb_clf,
param_grid=param_grid,
scoring='roc_auc',
cv=3, verbose=3)
grid_search_xgb.fit(X_train, y_train)
# best_model = grid_search_xgb.best_estimator_yes
Fitting 3 folds for each of 72 candidates, totalling 216 fits [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.926 total time= 2.5s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.927 total time= 2.7s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.928 total time= 2.6s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.926 total time= 2.4s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.927 total time= 2.5s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.928 total time= 2.8s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.925 total time= 2.5s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.926 total time= 3.3s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.927 total time= 2.7s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.925 total time= 2.9s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.926 total time= 2.9s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.927 total time= 2.7s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.924 total time= 2.6s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.925 total time= 2.6s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.926 total time= 2.5s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.924 total time= 2.6s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.925 total time= 2.5s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.926 total time= 2.6s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.928 total time= 3.9s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.929 total time= 3.9s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.930 total time= 3.9s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.928 total time= 3.8s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.929 total time= 3.9s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.930 total time= 4.0s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.927 total time= 4.0s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.927 total time= 4.1s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.929 total time= 3.9s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.927 total time= 4.0s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.927 total time= 4.0s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.928 total time= 4.0s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.926 total time= 4.0s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.927 total time= 4.0s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.928 total time= 4.0s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.926 total time= 4.1s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.927 total time= 3.9s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.928 total time= 4.0s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.933 total time= 10.4s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.933 total time= 9.3s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.934 total time= 9.2s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.933 total time= 9.3s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.933 total time= 9.2s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.934 total time= 9.2s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.932 total time= 9.6s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.932 total time= 9.9s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.934 total time= 9.5s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.932 total time= 9.6s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.932 total time= 9.4s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.934 total time= 9.5s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.931 total time= 9.6s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.932 total time= 9.6s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.933 total time= 9.7s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.931 total time= 9.6s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.932 total time= 9.6s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.933 total time= 9.5s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.934 total time= 2.5s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.935 total time= 2.5s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.936 total time= 2.5s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.934 total time= 2.5s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.935 total time= 2.7s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.936 total time= 2.5s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.934 total time= 2.6s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.935 total time= 2.7s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.936 total time= 2.6s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.934 total time= 3.2s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.935 total time= 2.7s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.936 total time= 2.8s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.934 total time= 3.2s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.934 total time= 2.6s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.935 total time= 2.6s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.934 total time= 2.5s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.935 total time= 2.5s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.936 total time= 2.6s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.937 total time= 3.7s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.937 total time= 3.7s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.938 total time= 4.3s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.937 total time= 3.7s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.938 total time= 3.7s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.939 total time= 3.8s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.936 total time= 3.7s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.937 total time= 3.7s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.938 total time= 3.7s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.937 total time= 3.7s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.938 total time= 3.8s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.938 total time= 3.7s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.936 total time= 3.7s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.937 total time= 3.7s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.938 total time= 3.6s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.936 total time= 3.7s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.937 total time= 3.7s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.938 total time= 3.7s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.939 total time= 8.2s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.940 total time= 9.2s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.941 total time= 9.9s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.940 total time= 8.0s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.941 total time= 7.8s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.942 total time= 9.7s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.938 total time= 8.2s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.939 total time= 8.1s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.940 total time= 8.3s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.939 total time= 8.1s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.940 total time= 8.0s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.941 total time= 8.0s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.938 total time= 8.2s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.939 total time= 9.4s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.939 total time= 8.3s [CV 1/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.939 total time= 8.1s [CV 2/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.940 total time= 8.0s [CV 3/3] END colsample_bytree=0.5, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.940 total time= 8.2s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.923 total time= 2.7s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.924 total time= 2.6s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.925 total time= 2.6s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.922 total time= 2.6s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.923 total time= 2.6s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.925 total time= 2.6s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.922 total time= 2.8s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.923 total time= 3.0s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.924 total time= 2.7s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.921 total time= 2.8s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.922 total time= 2.7s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.923 total time= 2.7s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.921 total time= 2.7s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.921 total time= 2.7s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.923 total time= 2.7s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.920 total time= 2.7s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.921 total time= 2.7s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.922 total time= 2.6s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.926 total time= 4.1s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.927 total time= 4.3s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.928 total time= 3.8s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.925 total time= 3.9s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.926 total time= 3.8s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.927 total time= 3.8s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.924 total time= 3.9s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.925 total time= 3.9s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.926 total time= 4.0s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.924 total time= 3.9s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.925 total time= 3.9s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.926 total time= 3.9s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.923 total time= 3.9s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.924 total time= 4.0s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.926 total time= 3.9s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.923 total time= 3.9s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.924 total time= 4.7s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.925 total time= 4.1s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.932 total time= 9.2s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.933 total time= 9.0s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.934 total time= 9.0s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.932 total time= 9.0s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.933 total time= 8.9s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.934 total time= 9.3s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.931 total time= 9.3s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.932 total time= 9.3s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.933 total time= 9.9s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.931 total time= 9.2s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.932 total time= 9.3s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.933 total time= 9.4s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.931 total time= 9.6s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.932 total time= 9.5s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.933 total time= 10.1s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.931 total time= 10.2s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.932 total time= 10.2s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.01, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.933 total time= 9.3s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.934 total time= 2.5s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.935 total time= 2.5s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.5;, score=0.936 total time= 2.5s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.934 total time= 2.5s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.935 total time= 2.4s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=1, subsample=0.8;, score=0.936 total time= 2.5s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.934 total time= 2.5s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.935 total time= 2.5s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.5;, score=0.936 total time= 2.5s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.934 total time= 2.5s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.935 total time= 2.5s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=5, subsample=0.8;, score=0.936 total time= 2.5s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.934 total time= 2.5s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.935 total time= 2.6s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.5;, score=0.936 total time= 2.5s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.934 total time= 2.6s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.935 total time= 2.5s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=50, scale_pos_weight=10, subsample=0.8;, score=0.936 total time= 2.5s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.937 total time= 3.6s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.938 total time= 3.7s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.5;, score=0.939 total time= 3.6s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.937 total time= 3.6s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.938 total time= 3.9s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=1, subsample=0.8;, score=0.939 total time= 3.7s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.936 total time= 3.6s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.937 total time= 3.6s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.5;, score=0.938 total time= 3.6s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.937 total time= 3.6s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.938 total time= 3.7s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=5, subsample=0.8;, score=0.939 total time= 3.6s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.936 total time= 3.6s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.937 total time= 3.6s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.5;, score=0.938 total time= 3.6s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.937 total time= 3.6s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.937 total time= 3.6s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=100, scale_pos_weight=10, subsample=0.8;, score=0.938 total time= 3.6s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.939 total time= 7.9s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.940 total time= 8.4s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.5;, score=0.941 total time= 8.0s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.940 total time= 7.8s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.941 total time= 7.7s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=1, subsample=0.8;, score=0.942 total time= 7.8s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.938 total time= 8.0s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.940 total time= 7.9s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.5;, score=0.940 total time= 8.0s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.940 total time= 8.7s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.940 total time= 7.8s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=5, subsample=0.8;, score=0.941 total time= 7.8s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.938 total time= 8.1s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.939 total time= 8.5s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.5;, score=0.940 total time= 8.0s [CV 1/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.939 total time= 7.9s [CV 2/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.940 total time= 7.7s [CV 3/3] END colsample_bytree=1.0, learning_rate=0.1, n_estimators=300, scale_pos_weight=10, subsample=0.8;, score=0.941 total time= 7.7s
Out[50]:
GridSearchCV(cv=3,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None, device=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=None,
grow_policy=None, importance_type=None,
interaction_constraints=None,
learning_rate=None,...
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None,
n_jobs=None, num_parallel_tree=None,
random_state=None, ...),
param_grid={'colsample_bytree': [0.5, 1.0],
'learning_rate': [0.01, 0.1],
'n_estimators': [50, 100, 300],
'scale_pos_weight': [1, 5, 10],
'subsample': [0.5, 0.8]},
scoring='roc_auc', verbose=3)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=3,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None, device=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=None,
grow_policy=None, importance_type=None,
interaction_constraints=None,
learning_rate=None,...
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None,
n_jobs=None, num_parallel_tree=None,
random_state=None, ...),
param_grid={'colsample_bytree': [0.5, 1.0],
'learning_rate': [0.01, 0.1],
'n_estimators': [50, 100, 300],
'scale_pos_weight': [1, 5, 10],
'subsample': [0.5, 0.8]},
scoring='roc_auc', verbose=3)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)In [52]:
from sklearn.metrics import roc_auc_score
import pandas as pd
from xgboost import XGBClassifier
# Initialize an empty DataFrame to store results
results_df_xgb = pd.DataFrame(columns=['n_estimators', 'learning_rate', 'Subsample %', 'Features',
'% Weight of Default', 'AUC Train', 'AUC Test 1',
'AUC Test 2'])
# Iterate over each combination of settings
for i, params in enumerate(grid_search_xgb.cv_results_['params']):
# Initialize the model with current parameters
model = XGBClassifier(use_label_encoder=False, seed=4, **params)
# Fit the model with the current parameter combination
model.fit(X_train, y_train) # Ensure to include eval_metric to avoid warnings
# Predict probabilities for each set
train_pred = model.predict_proba(X_train)[:, 1]
test1_pred = model.predict_proba(X_test1)[:, 1]
test2_pred = model.predict_proba(X_test2)[:, 1]
# Calculate AUC for each set
auc_train = roc_auc_score(y_train, train_pred)
auc_test1 = roc_auc_score(y_test1, test1_pred)
auc_test2 = roc_auc_score(y_test2, test2_pred)
# Create a temporary DataFrame with the results
temp_df = pd.DataFrame({
'n_estimators': [params.get('n_estimators', 'Not specified')],
'learning_rate': [params.get('learning_rate', 'Not specified')],
'Subsample %': [f"{params.get('subsample', 0) * 100}%"],
'Features': [f"{params.get('colsample_bytree', 0) * 100}%"],
'% Weight of Default': [params.get('scale_pos_weight', 'Not specified')],
'AUC Train': [auc_train],
'AUC Test 1': [auc_test1],
'AUC Test 2': [auc_test2]
})
# Append the temporary DataFrame to the main results DataFrame
results_df_xgb = pd.concat([results_df_xgb, temp_df], ignore_index=True)
# # Optionally, save the results to a CSV file
results_df_xgb.to_csv('grid_search_results_xgb.csv', index=False)
# Display the first few rows of the results DataFrame
results_df_xgb.head()
/tmp/ipykernel_9594/2378702306.py:42: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.
Out[52]:
| n_estimators | learning_rate | Subsample % | Features | % Weight of Default | AUC Train | AUC Test 1 | AUC Test 2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 50 | 0.01 | 50.0% | 50.0% | 1 | 0.928372 | 0.928303 | 0.926287 |
| 1 | 50 | 0.01 | 80.0% | 50.0% | 1 | 0.928376 | 0.928324 | 0.926273 |
| 2 | 50 | 0.01 | 50.0% | 50.0% | 5 | 0.926768 | 0.926775 | 0.925061 |
| 3 | 50 | 0.01 | 80.0% | 50.0% | 5 | 0.926663 | 0.926621 | 0.924905 |
| 4 | 50 | 0.01 | 50.0% | 50.0% | 10 | 0.926124 | 0.925888 | 0.924271 |
In [53]:
results_df_xgb
Out[53]:
| n_estimators | learning_rate | Subsample % | Features | % Weight of Default | AUC Train | AUC Test 1 | AUC Test 2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 50 | 0.01 | 50.0% | 50.0% | 1 | 0.928372 | 0.928303 | 0.926287 |
| 1 | 50 | 0.01 | 80.0% | 50.0% | 1 | 0.928376 | 0.928324 | 0.926273 |
| 2 | 50 | 0.01 | 50.0% | 50.0% | 5 | 0.926768 | 0.926775 | 0.925061 |
| 3 | 50 | 0.01 | 80.0% | 50.0% | 5 | 0.926663 | 0.926621 | 0.924905 |
| 4 | 50 | 0.01 | 50.0% | 50.0% | 10 | 0.926124 | 0.925888 | 0.924271 |
| 5 | 50 | 0.01 | 80.0% | 50.0% | 10 | 0.926137 | 0.925835 | 0.924257 |
| 6 | 100 | 0.01 | 50.0% | 50.0% | 1 | 0.930257 | 0.930116 | 0.928110 |
| 7 | 100 | 0.01 | 80.0% | 50.0% | 1 | 0.930245 | 0.930050 | 0.928078 |
| 8 | 100 | 0.01 | 50.0% | 50.0% | 5 | 0.928713 | 0.928592 | 0.926844 |
| 9 | 100 | 0.01 | 80.0% | 50.0% | 5 | 0.928743 | 0.928570 | 0.926841 |
| 10 | 100 | 0.01 | 50.0% | 50.0% | 10 | 0.928194 | 0.927968 | 0.926264 |
| 11 | 100 | 0.01 | 80.0% | 50.0% | 10 | 0.928240 | 0.927970 | 0.926294 |
| 12 | 300 | 0.01 | 50.0% | 50.0% | 1 | 0.935255 | 0.934829 | 0.932786 |
| 13 | 300 | 0.01 | 80.0% | 50.0% | 1 | 0.935363 | 0.934800 | 0.932798 |
| 14 | 300 | 0.01 | 50.0% | 50.0% | 5 | 0.934256 | 0.933868 | 0.931979 |
| 15 | 300 | 0.01 | 80.0% | 50.0% | 5 | 0.934359 | 0.933898 | 0.931973 |
| 16 | 300 | 0.01 | 50.0% | 50.0% | 10 | 0.933984 | 0.933526 | 0.931654 |
| 17 | 300 | 0.01 | 80.0% | 50.0% | 10 | 0.934116 | 0.933613 | 0.931757 |
| 18 | 50 | 0.10 | 50.0% | 50.0% | 1 | 0.937436 | 0.936724 | 0.934535 |
| 19 | 50 | 0.10 | 80.0% | 50.0% | 1 | 0.937710 | 0.936669 | 0.934671 |
| 20 | 50 | 0.10 | 50.0% | 50.0% | 5 | 0.937022 | 0.936154 | 0.934143 |
| 21 | 50 | 0.10 | 80.0% | 50.0% | 5 | 0.937327 | 0.936293 | 0.934288 |
| 22 | 50 | 0.10 | 50.0% | 50.0% | 10 | 0.936832 | 0.935928 | 0.934052 |
| 23 | 50 | 0.10 | 80.0% | 50.0% | 10 | 0.937251 | 0.936210 | 0.934328 |
| 24 | 100 | 0.10 | 50.0% | 50.0% | 1 | 0.941613 | 0.939421 | 0.937452 |
| 25 | 100 | 0.10 | 80.0% | 50.0% | 1 | 0.942032 | 0.939454 | 0.937744 |
| 26 | 100 | 0.10 | 50.0% | 50.0% | 5 | 0.941202 | 0.938948 | 0.936946 |
| 27 | 100 | 0.10 | 80.0% | 50.0% | 5 | 0.941761 | 0.939196 | 0.937284 |
| 28 | 100 | 0.10 | 50.0% | 50.0% | 10 | 0.940957 | 0.938643 | 0.936980 |
| 29 | 100 | 0.10 | 80.0% | 50.0% | 10 | 0.941552 | 0.938998 | 0.937176 |
| 30 | 300 | 0.10 | 50.0% | 50.0% | 1 | 0.950337 | 0.942353 | 0.940698 |
| 31 | 300 | 0.10 | 80.0% | 50.0% | 1 | 0.951250 | 0.943107 | 0.941669 |
| 32 | 300 | 0.10 | 50.0% | 50.0% | 5 | 0.949579 | 0.941566 | 0.940027 |
| 33 | 300 | 0.10 | 80.0% | 50.0% | 5 | 0.950630 | 0.942517 | 0.941033 |
| 34 | 300 | 0.10 | 50.0% | 50.0% | 10 | 0.948608 | 0.940792 | 0.939439 |
| 35 | 300 | 0.10 | 80.0% | 50.0% | 10 | 0.949791 | 0.941820 | 0.940312 |
| 36 | 50 | 0.01 | 50.0% | 100.0% | 1 | 0.925120 | 0.925311 | 0.922637 |
| 37 | 50 | 0.01 | 80.0% | 100.0% | 1 | 0.924699 | 0.924838 | 0.922121 |
| 38 | 50 | 0.01 | 50.0% | 100.0% | 5 | 0.923819 | 0.923977 | 0.921518 |
| 39 | 50 | 0.01 | 80.0% | 100.0% | 5 | 0.923333 | 0.923434 | 0.920992 |
| 40 | 50 | 0.01 | 50.0% | 100.0% | 10 | 0.922479 | 0.922612 | 0.919991 |
| 41 | 50 | 0.01 | 80.0% | 100.0% | 10 | 0.922311 | 0.922456 | 0.919779 |
| 42 | 100 | 0.01 | 50.0% | 100.0% | 1 | 0.928229 | 0.928273 | 0.925745 |
| 43 | 100 | 0.01 | 80.0% | 100.0% | 1 | 0.927909 | 0.927887 | 0.925345 |
| 44 | 100 | 0.01 | 50.0% | 100.0% | 5 | 0.926455 | 0.926499 | 0.924102 |
| 45 | 100 | 0.01 | 80.0% | 100.0% | 5 | 0.926160 | 0.926101 | 0.923743 |
| 46 | 100 | 0.01 | 50.0% | 100.0% | 10 | 0.925621 | 0.925564 | 0.923127 |
| 47 | 100 | 0.01 | 80.0% | 100.0% | 10 | 0.925468 | 0.925264 | 0.922913 |
| 48 | 300 | 0.01 | 50.0% | 100.0% | 1 | 0.935036 | 0.934519 | 0.932142 |
| 49 | 300 | 0.01 | 80.0% | 100.0% | 1 | 0.935049 | 0.934341 | 0.931995 |
| 50 | 300 | 0.01 | 50.0% | 100.0% | 5 | 0.934065 | 0.933575 | 0.931288 |
| 51 | 300 | 0.01 | 80.0% | 100.0% | 5 | 0.934097 | 0.933516 | 0.931220 |
| 52 | 300 | 0.01 | 50.0% | 100.0% | 10 | 0.933781 | 0.933239 | 0.930966 |
| 53 | 300 | 0.01 | 80.0% | 100.0% | 10 | 0.933785 | 0.933121 | 0.930876 |
| 54 | 50 | 0.10 | 50.0% | 100.0% | 1 | 0.937919 | 0.936702 | 0.934670 |
| 55 | 50 | 0.10 | 80.0% | 100.0% | 1 | 0.938281 | 0.936941 | 0.934706 |
| 56 | 50 | 0.10 | 50.0% | 100.0% | 5 | 0.937449 | 0.936355 | 0.934286 |
| 57 | 50 | 0.10 | 80.0% | 100.0% | 5 | 0.937841 | 0.936566 | 0.934483 |
| 58 | 50 | 0.10 | 50.0% | 100.0% | 10 | 0.937397 | 0.936300 | 0.934179 |
| 59 | 50 | 0.10 | 80.0% | 100.0% | 10 | 0.937579 | 0.936349 | 0.934352 |
| 60 | 100 | 0.10 | 50.0% | 100.0% | 1 | 0.942236 | 0.939463 | 0.937600 |
| 61 | 100 | 0.10 | 80.0% | 100.0% | 1 | 0.942833 | 0.939768 | 0.937917 |
| 62 | 100 | 0.10 | 50.0% | 100.0% | 5 | 0.941946 | 0.939199 | 0.937312 |
| 63 | 100 | 0.10 | 80.0% | 100.0% | 5 | 0.942456 | 0.939453 | 0.937724 |
| 64 | 100 | 0.10 | 50.0% | 100.0% | 10 | 0.941646 | 0.938857 | 0.936975 |
| 65 | 100 | 0.10 | 80.0% | 100.0% | 10 | 0.942023 | 0.939201 | 0.937381 |
| 66 | 300 | 0.10 | 50.0% | 100.0% | 1 | 0.951841 | 0.942651 | 0.941195 |
| 67 | 300 | 0.10 | 80.0% | 100.0% | 1 | 0.952678 | 0.943455 | 0.941911 |
| 68 | 300 | 0.10 | 50.0% | 100.0% | 5 | 0.951079 | 0.941968 | 0.940384 |
| 69 | 300 | 0.10 | 80.0% | 100.0% | 5 | 0.951965 | 0.942892 | 0.941367 |
| 70 | 300 | 0.10 | 50.0% | 100.0% | 10 | 0.949800 | 0.941164 | 0.939527 |
| 71 | 300 | 0.10 | 80.0% | 100.0% | 10 | 0.951010 | 0.942212 | 0.940624 |
In [54]:
# calculate the mean auc
results_df_xgb['Average AUC'] = results_df_xgb[['AUC Train', 'AUC Test 1', 'AUC Test 2']].mean(axis=1)
# calculate
results_df_xgb['Std AUC'] = results_df_xgb[['AUC Train', 'AUC Test 1', 'AUC Test 2']].std(axis=1)
results_df_xgb
# results_df_xgb.iloc[:, 5:]
Out[54]:
| n_estimators | learning_rate | Subsample % | Features | % Weight of Default | AUC Train | AUC Test 1 | AUC Test 2 | Average AUC | Std AUC | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 50 | 0.01 | 50.0% | 50.0% | 1 | 0.928372 | 0.928303 | 0.926287 | 0.927654 | 0.001184 |
| 1 | 50 | 0.01 | 80.0% | 50.0% | 1 | 0.928376 | 0.928324 | 0.926273 | 0.927658 | 0.001200 |
| 2 | 50 | 0.01 | 50.0% | 50.0% | 5 | 0.926768 | 0.926775 | 0.925061 | 0.926201 | 0.000988 |
| 3 | 50 | 0.01 | 80.0% | 50.0% | 5 | 0.926663 | 0.926621 | 0.924905 | 0.926063 | 0.001003 |
| 4 | 50 | 0.01 | 50.0% | 50.0% | 10 | 0.926124 | 0.925888 | 0.924271 | 0.925428 | 0.001009 |
| 5 | 50 | 0.01 | 80.0% | 50.0% | 10 | 0.926137 | 0.925835 | 0.924257 | 0.925410 | 0.001010 |
| 6 | 100 | 0.01 | 50.0% | 50.0% | 1 | 0.930257 | 0.930116 | 0.928110 | 0.929495 | 0.001201 |
| 7 | 100 | 0.01 | 80.0% | 50.0% | 1 | 0.930245 | 0.930050 | 0.928078 | 0.929458 | 0.001199 |
| 8 | 100 | 0.01 | 50.0% | 50.0% | 5 | 0.928713 | 0.928592 | 0.926844 | 0.928050 | 0.001046 |
| 9 | 100 | 0.01 | 80.0% | 50.0% | 5 | 0.928743 | 0.928570 | 0.926841 | 0.928051 | 0.001051 |
| 10 | 100 | 0.01 | 50.0% | 50.0% | 10 | 0.928194 | 0.927968 | 0.926264 | 0.927476 | 0.001055 |
| 11 | 100 | 0.01 | 80.0% | 50.0% | 10 | 0.928240 | 0.927970 | 0.926294 | 0.927501 | 0.001054 |
| 12 | 300 | 0.01 | 50.0% | 50.0% | 1 | 0.935255 | 0.934829 | 0.932786 | 0.934290 | 0.001320 |
| 13 | 300 | 0.01 | 80.0% | 50.0% | 1 | 0.935363 | 0.934800 | 0.932798 | 0.934320 | 0.001348 |
| 14 | 300 | 0.01 | 50.0% | 50.0% | 5 | 0.934256 | 0.933868 | 0.931979 | 0.933367 | 0.001218 |
| 15 | 300 | 0.01 | 80.0% | 50.0% | 5 | 0.934359 | 0.933898 | 0.931973 | 0.933410 | 0.001266 |
| 16 | 300 | 0.01 | 50.0% | 50.0% | 10 | 0.933984 | 0.933526 | 0.931654 | 0.933054 | 0.001235 |
| 17 | 300 | 0.01 | 80.0% | 50.0% | 10 | 0.934116 | 0.933613 | 0.931757 | 0.933162 | 0.001243 |
| 18 | 50 | 0.10 | 50.0% | 50.0% | 1 | 0.937436 | 0.936724 | 0.934535 | 0.936232 | 0.001512 |
| 19 | 50 | 0.10 | 80.0% | 50.0% | 1 | 0.937710 | 0.936669 | 0.934671 | 0.936350 | 0.001545 |
| 20 | 50 | 0.10 | 50.0% | 50.0% | 5 | 0.937022 | 0.936154 | 0.934143 | 0.935773 | 0.001477 |
| 21 | 50 | 0.10 | 80.0% | 50.0% | 5 | 0.937327 | 0.936293 | 0.934288 | 0.935969 | 0.001545 |
| 22 | 50 | 0.10 | 50.0% | 50.0% | 10 | 0.936832 | 0.935928 | 0.934052 | 0.935604 | 0.001418 |
| 23 | 50 | 0.10 | 80.0% | 50.0% | 10 | 0.937251 | 0.936210 | 0.934328 | 0.935930 | 0.001482 |
| 24 | 100 | 0.10 | 50.0% | 50.0% | 1 | 0.941613 | 0.939421 | 0.937452 | 0.939495 | 0.002082 |
| 25 | 100 | 0.10 | 80.0% | 50.0% | 1 | 0.942032 | 0.939454 | 0.937744 | 0.939743 | 0.002159 |
| 26 | 100 | 0.10 | 50.0% | 50.0% | 5 | 0.941202 | 0.938948 | 0.936946 | 0.939032 | 0.002129 |
| 27 | 100 | 0.10 | 80.0% | 50.0% | 5 | 0.941761 | 0.939196 | 0.937284 | 0.939414 | 0.002246 |
| 28 | 100 | 0.10 | 50.0% | 50.0% | 10 | 0.940957 | 0.938643 | 0.936980 | 0.938860 | 0.001997 |
| 29 | 100 | 0.10 | 80.0% | 50.0% | 10 | 0.941552 | 0.938998 | 0.937176 | 0.939242 | 0.002198 |
| 30 | 300 | 0.10 | 50.0% | 50.0% | 1 | 0.950337 | 0.942353 | 0.940698 | 0.944462 | 0.005154 |
| 31 | 300 | 0.10 | 80.0% | 50.0% | 1 | 0.951250 | 0.943107 | 0.941669 | 0.945342 | 0.005167 |
| 32 | 300 | 0.10 | 50.0% | 50.0% | 5 | 0.949579 | 0.941566 | 0.940027 | 0.943724 | 0.005128 |
| 33 | 300 | 0.10 | 80.0% | 50.0% | 5 | 0.950630 | 0.942517 | 0.941033 | 0.944727 | 0.005166 |
| 34 | 300 | 0.10 | 50.0% | 50.0% | 10 | 0.948608 | 0.940792 | 0.939439 | 0.942946 | 0.004949 |
| 35 | 300 | 0.10 | 80.0% | 50.0% | 10 | 0.949791 | 0.941820 | 0.940312 | 0.943974 | 0.005093 |
| 36 | 50 | 0.01 | 50.0% | 100.0% | 1 | 0.925120 | 0.925311 | 0.922637 | 0.924356 | 0.001492 |
| 37 | 50 | 0.01 | 80.0% | 100.0% | 1 | 0.924699 | 0.924838 | 0.922121 | 0.923886 | 0.001530 |
| 38 | 50 | 0.01 | 50.0% | 100.0% | 5 | 0.923819 | 0.923977 | 0.921518 | 0.923105 | 0.001376 |
| 39 | 50 | 0.01 | 80.0% | 100.0% | 5 | 0.923333 | 0.923434 | 0.920992 | 0.922586 | 0.001381 |
| 40 | 50 | 0.01 | 50.0% | 100.0% | 10 | 0.922479 | 0.922612 | 0.919991 | 0.921694 | 0.001476 |
| 41 | 50 | 0.01 | 80.0% | 100.0% | 10 | 0.922311 | 0.922456 | 0.919779 | 0.921515 | 0.001505 |
| 42 | 100 | 0.01 | 50.0% | 100.0% | 1 | 0.928229 | 0.928273 | 0.925745 | 0.927416 | 0.001447 |
| 43 | 100 | 0.01 | 80.0% | 100.0% | 1 | 0.927909 | 0.927887 | 0.925345 | 0.927047 | 0.001474 |
| 44 | 100 | 0.01 | 50.0% | 100.0% | 5 | 0.926455 | 0.926499 | 0.924102 | 0.925685 | 0.001371 |
| 45 | 100 | 0.01 | 80.0% | 100.0% | 5 | 0.926160 | 0.926101 | 0.923743 | 0.925334 | 0.001379 |
| 46 | 100 | 0.01 | 50.0% | 100.0% | 10 | 0.925621 | 0.925564 | 0.923127 | 0.924771 | 0.001424 |
| 47 | 100 | 0.01 | 80.0% | 100.0% | 10 | 0.925468 | 0.925264 | 0.922913 | 0.924548 | 0.001420 |
| 48 | 300 | 0.01 | 50.0% | 100.0% | 1 | 0.935036 | 0.934519 | 0.932142 | 0.933899 | 0.001543 |
| 49 | 300 | 0.01 | 80.0% | 100.0% | 1 | 0.935049 | 0.934341 | 0.931995 | 0.933795 | 0.001599 |
| 50 | 300 | 0.01 | 50.0% | 100.0% | 5 | 0.934065 | 0.933575 | 0.931288 | 0.932976 | 0.001482 |
| 51 | 300 | 0.01 | 80.0% | 100.0% | 5 | 0.934097 | 0.933516 | 0.931220 | 0.932944 | 0.001521 |
| 52 | 300 | 0.01 | 50.0% | 100.0% | 10 | 0.933781 | 0.933239 | 0.930966 | 0.932662 | 0.001494 |
| 53 | 300 | 0.01 | 80.0% | 100.0% | 10 | 0.933785 | 0.933121 | 0.930876 | 0.932594 | 0.001525 |
| 54 | 50 | 0.10 | 50.0% | 100.0% | 1 | 0.937919 | 0.936702 | 0.934670 | 0.936431 | 0.001642 |
| 55 | 50 | 0.10 | 80.0% | 100.0% | 1 | 0.938281 | 0.936941 | 0.934706 | 0.936643 | 0.001806 |
| 56 | 50 | 0.10 | 50.0% | 100.0% | 5 | 0.937449 | 0.936355 | 0.934286 | 0.936030 | 0.001607 |
| 57 | 50 | 0.10 | 80.0% | 100.0% | 5 | 0.937841 | 0.936566 | 0.934483 | 0.936296 | 0.001695 |
| 58 | 50 | 0.10 | 50.0% | 100.0% | 10 | 0.937397 | 0.936300 | 0.934179 | 0.935959 | 0.001636 |
| 59 | 50 | 0.10 | 80.0% | 100.0% | 10 | 0.937579 | 0.936349 | 0.934352 | 0.936093 | 0.001629 |
| 60 | 100 | 0.10 | 50.0% | 100.0% | 1 | 0.942236 | 0.939463 | 0.937600 | 0.939766 | 0.002333 |
| 61 | 100 | 0.10 | 80.0% | 100.0% | 1 | 0.942833 | 0.939768 | 0.937917 | 0.940173 | 0.002483 |
| 62 | 100 | 0.10 | 50.0% | 100.0% | 5 | 0.941946 | 0.939199 | 0.937312 | 0.939486 | 0.002330 |
| 63 | 100 | 0.10 | 80.0% | 100.0% | 5 | 0.942456 | 0.939453 | 0.937724 | 0.939878 | 0.002394 |
| 64 | 100 | 0.10 | 50.0% | 100.0% | 10 | 0.941646 | 0.938857 | 0.936975 | 0.939159 | 0.002350 |
| 65 | 100 | 0.10 | 80.0% | 100.0% | 10 | 0.942023 | 0.939201 | 0.937381 | 0.939535 | 0.002339 |
| 66 | 300 | 0.10 | 50.0% | 100.0% | 1 | 0.951841 | 0.942651 | 0.941195 | 0.945229 | 0.005772 |
| 67 | 300 | 0.10 | 80.0% | 100.0% | 1 | 0.952678 | 0.943455 | 0.941911 | 0.946015 | 0.005822 |
| 68 | 300 | 0.10 | 50.0% | 100.0% | 5 | 0.951079 | 0.941968 | 0.940384 | 0.944477 | 0.005772 |
| 69 | 300 | 0.10 | 80.0% | 100.0% | 5 | 0.951965 | 0.942892 | 0.941367 | 0.945408 | 0.005730 |
| 70 | 300 | 0.10 | 50.0% | 100.0% | 10 | 0.949800 | 0.941164 | 0.939527 | 0.943497 | 0.005520 |
| 71 | 300 | 0.10 | 80.0% | 100.0% | 10 | 0.951010 | 0.942212 | 0.940624 | 0.944615 | 0.005595 |
In [55]:
plt.figure(figsize=(10, 5))
plt.scatter(results_df_xgb['Average AUC'], results_df_xgb['Std AUC'])
xlab = 'Average AUC Scores'
ylab = 'Standard Deviation of AUC Scores'
title = 'XGBoost Model Average AUC vs Std AUC'
plt.xlabel(xlab)
plt.ylabel(ylab)
plt.title(title)
plt.show()
In [56]:
plt.figure(figsize=(10, 5))
plt.scatter(results_df_xgb['AUC Train'], results_df_xgb['AUC Test 2'])
xlab = 'AUC of Train sample'
ylab = 'AUC of Test 2'
title = 'Train AUC vs Test_2 AUC'
plt.xlabel(xlab)
plt.ylabel(ylab)
plt.title(title)
plt.show()
Best XGB Model Based on Bias and Variance¶
In [57]:
# Calculate the absolute difference between 'AUC Train' and 'AUC Test 1'
results_df_xgb['AUC Diff'] = abs(results_df_xgb['AUC Train'] - results_df_xgb['AUC Test 1'])
# Find the minimum difference to identify the models with the closest train and test performance
min_diff = results_df_xgb['AUC Diff'].min()
# Filter the DataFrame to rows that match the minimum difference
min_diff_rows = results_df_xgb[results_df_xgb['AUC Diff'] == min_diff]
# From those rows, find the one with the highest 'AUC Test 2'
highest_test_auc_index = min_diff_rows['AUC Test 2'].idxmax()
# Select the best row based on the criteria
best_model_xgb = results_df_xgb.loc[highest_test_auc_index]
print("Row with the highest 'AUC Test 2' and smallest difference between 'AUC Train' and 'AUC Test 2':")
print(best_model_xgb)
Row with the highest 'AUC Test 2' and smallest difference between 'AUC Train' and 'AUC Test 2': n_estimators 50 learning_rate 0.01 Subsample % 50.0% Features 50.0% % Weight of Default 5 AUC Train 0.926768 AUC Test 1 0.926775 AUC Test 2 0.925061 Average AUC 0.926201 Std AUC 0.000988 AUC Diff 0.000007 Name: 2, dtype: object
In [58]:
best_xgb_model_params = {
'n_estimators': best_model_xgb['n_estimators'],
'learning_rate': best_model_xgb['learning_rate'],
'subsample': float(best_model_xgb['Subsample %'][:-1]) / 100.0,
'colsample_bytree': float(best_model_xgb['Features'][:-1]) / 100.0,
'scale_pos_weight': best_model_xgb['% Weight of Default']
}
In [59]:
xgb_final = XGBClassifier(**best_xgb_model_params)
xgb_final.fit(X_train, y_train)
Out[59]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=0.5, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.01, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=50, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=0.5, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.01, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=50, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)In [60]:
print(xgb_final.get_params())
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.5, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.01, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 50, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 5, 'subsample': 0.5, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}
In [61]:
# Get feature importances
feature_importances_3 = xgb_final.feature_importances_
# Print feature importance for features with importance > 0.5%
print("Features with importance higher than 0.5%:")
for feature_name, importance in zip(X_train.columns, feature_importances_3):
if importance > 0.005:
print(f'Feature: {feature_name}, Importance: {importance}')
Features with importance higher than 0.5%: Feature: D_52, Importance: 0.01048071775585413 Feature: B_2, Importance: 0.013965119607746601 Feature: S_7, Importance: 0.00853777676820755 Feature: B_7, Importance: 0.16809943318367004 Feature: P_2, Importance: 0.19531044363975525 Feature: B_3, Importance: 0.02020329236984253 Feature: D_45, Importance: 0.019460203126072884 Feature: D_44, Importance: 0.03430410474538803 Feature: B_1, Importance: 0.05245744436979294 Feature: B_37, Importance: 0.05257605016231537 Feature: P_Total, Importance: 0.011594913899898529 Feature: B_Total, Importance: 0.006424103397876024 Feature: D_75, Importance: 0.03697102516889572 Feature: R_27, Importance: 0.005594146903604269 Feature: B_9, Importance: 0.03343643620610237 Feature: D_48, Importance: 0.21153990924358368 Feature: B_10, Importance: 0.03539016842842102 Feature: B_Ave, Importance: 0.008054395206272602 Feature: S_3, Importance: 0.007965869270265102 Feature: D_42, Importance: 0.016912920400500298
In [62]:
# Plotting Top 10 features
from xgboost import plot_importance
plt.figure(figsize=(15, 20))
plot_importance(xgb_final, max_num_features=20)
plt.title('Top 20 Feature Importance')
plt.show()
<Figure size 1500x2000 with 0 Axes>
Model Performance Functions¶
In [63]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
def model_performance(model_name,
X_train_data,
y_train_data,
X_test_data1,
y_test_data1,
X_test_data2,
y_test_data2):
train_preds = model_name.predict(X_train_data)
test_preds1= model_name.predict(X_test_data1)
test_preds2 = model_name.predict(X_test_data2)
a = classification_report(y_train_data, train_preds)
b = classification_report(y_test_data1, test_preds1)
e = classification_report(y_test_data2, test_preds2)
c = round(model_name.score(X_train_data, y_train_data),4)
d = round(model_name.score(X_test_data1, y_test_data1),4)
f = round(model_name.score(X_test_data2, y_test_data2),4)
print('classification report for training data')
print(a)
print('\n')
print('classification report for testing data 1')
print(b)
print('\n')
print('classification report for testing data 2')
print(e)
print('\n')
print('the model score for training data is ',c)
print('the model score for testing data 1 is ',d)
print('the model score for testing data 2 is ',f)
print('\n')
from seaborn import heatmap
fig, axes = plt.subplots(ncols=3)
fig.set_size_inches(20,5)
plt.subplot(1,3,1)
sns.heatmap(confusion_matrix(y_train_data, train_preds), annot=True, cbar=False, fmt='d')
plt.xlabel('predicted labels')
plt.ylabel('actual labels')
plt.title('Confusion Matrix for Training Data')
plt.subplot(1,3,2)
sns.heatmap(confusion_matrix(y_test_data1, test_preds1), annot=True, cbar=False, fmt='d')
plt.xlabel('predicted labels')
plt.ylabel('actual labels')
plt.title('Confusion Matrix for Testing Data 1')
plt.subplot(1,3,3)
sns.heatmap(confusion_matrix(y_test_data2, test_preds2), annot=True, cbar=False, fmt='d')
plt.xlabel('predicted labels')
plt.ylabel('actual labels')
plt.title('Confusion Matrix for Testing Data 2')
In [64]:
# ROC AUC Plots
def roc_score_auc_curve(model_name,
X_train_data,
y_train_data,
X_test_data1,
y_test_data1,
X_test_data2,
y_test_data2):
train_preds = model_name.predict(X_train_data)
test_preds1= model_name.predict(X_test_data1)
a = round(roc_auc_score(y_train_data, model_name.predict_proba(X_train_data)[:,1]),4)
b = round(roc_auc_score(y_test_data1, model_name.predict_proba(X_test_data1)[:,1]),4)
c = round(roc_auc_score(y_test_data2, model_name.predict_proba(X_test_data2)[:,1]),4)
print('AUC Score for Model on Training Data is',a)
print('AUC Score for Model on Testing Data 1 is',b)
print('AUC Score for Model on Testing Data 2 is',c)
plt.figure(figsize=(12,7))
train_fpr, train_tpr, train_thresholds = roc_curve(y_train_data, model_name.predict_proba(X_train_data)[:,1])
plt.plot([0,1],[0,1], linestyle='--', color='g')
plt.plot(train_fpr, train_tpr, marker='.')
test_fpr, test_tpr, test_thresholds = roc_curve(y_test_data1, model_name.predict_proba(X_test_data1)[:,1])
plt.plot([0,1],[0,1], linestyle='--', color='g')
plt.plot(test_fpr, test_tpr, marker='o')
test_fpr2, test_tpr2, test_thresholds2 = roc_curve(y_test_data2, model_name.predict_proba(X_test_data2)[:,1])
plt.plot([0,1],[0,1], linestyle='--', color='g')
plt.plot(test_fpr2, test_tpr2, marker='o')
Final XGBoost Model's Performance¶
In [65]:
model_performance(model_name=xgb_final,
X_train_data=X_train,
y_train_data=y_train,
X_test_data1=X_test1,
y_test_data1=y_test1,
X_test_data2=X_test2,
y_test_data2=y_test2)
classification report for training data
precision recall f1-score support
0 0.99 0.67 0.80 403583
1 0.49 0.97 0.66 133850
accuracy 0.75 537433
macro avg 0.74 0.82 0.73 537433
weighted avg 0.86 0.75 0.76 537433
classification report for testing data 1
precision recall f1-score support
0 0.99 0.67 0.80 86376
1 0.49 0.97 0.66 28788
accuracy 0.75 115164
macro avg 0.74 0.82 0.73 115164
weighted avg 0.86 0.75 0.76 115164
classification report for testing data 2
precision recall f1-score support
0 0.99 0.67 0.80 86273
1 0.50 0.97 0.66 28892
accuracy 0.74 115165
macro avg 0.74 0.82 0.73 115165
weighted avg 0.86 0.74 0.76 115165
the model score for training data is 0.7454
the model score for testing data 1 is 0.7451
the model score for testing data 2 is 0.7443
In [66]:
roc_score_auc_curve(model_name=xgb_final,
X_train_data=X_train,
y_train_data=y_train,
X_test_data1=X_test1,
y_test_data1=y_test1,
X_test_data2=X_test2,
y_test_data2=y_test2)
AUC Score for Model on Training Data is 0.9268 AUC Score for Model on Testing Data 1 is 0.9267 AUC Score for Model on Testing Data 2 is 0.9252
In [67]:
def k_fold_cross_valscore(model_name,
x_train_data,
y_train_data,
folds
):
from sklearn.model_selection import cross_val_score
import pandas as pd
model_kfold_recall = cross_val_score(model_name, x_train_data, y_train_data,cv=folds, scoring='recall', verbose=0)
model_kfold_accuracy = cross_val_score(model_name, x_train_data, y_train_data,cv=folds, scoring='accuracy', verbose=0)
model_kfold_precision = cross_val_score(model_name, x_train_data, y_train_data,cv=folds, scoring='precision', verbose=0)
model_kfold_f1 = cross_val_score(model_name, x_train_data, y_train_data,cv=folds, scoring='f1', verbose=0)
a = pd.DataFrame(model_kfold_recall)
b = pd.DataFrame(model_kfold_accuracy)
c = pd.DataFrame(model_kfold_precision)
d = pd.DataFrame(model_kfold_f1)
cross_val_data = pd.concat([a,b,c,d], ignore_index=True, axis=1)
cross_val_data.rename(columns={0:"Recall", 1:"Accuracy", 2:"Precision",3:"F1 Score"},
inplace=True)
print('\n')
print("The mean recall for the model after " ,folds," folds is ", np.mean(model_kfold_recall))
print("The mean accuracy for model after ",folds," folds is", np.mean(model_kfold_accuracy))
print("the mean precision for the model after ",folds," folds is",np.mean(model_kfold_precision))
print("the mean f1 score for the model after ",folds," folds is", np.mean(model_kfold_f1))
print("\n")
return cross_val_data
In [68]:
# K- Fold Cross Validation for Training Data
k_fold_cross_valscore(model_name =xgb_final,
x_train_data = X_train,
y_train_data = y_train,
folds = 10)
The mean recall for the model after 10 folds is 0.9730070974971984 The mean accuracy for model after 10 folds is 0.7452761576856105 the mean precision for the model after 10 folds is 0.49422347482501905 the mean f1 score for the model after 10 folds is 0.6554961255020741
Out[68]:
| Recall | Accuracy | Precision | F1 Score | |
|---|---|---|---|---|
| 0 | 0.973104 | 0.743785 | 0.492718 | 0.654194 |
| 1 | 0.972507 | 0.745516 | 0.494454 | 0.655587 |
| 2 | 0.974673 | 0.745274 | 0.494223 | 0.655875 |
| 3 | 0.970564 | 0.744860 | 0.493785 | 0.654557 |
| 4 | 0.972731 | 0.743818 | 0.492753 | 0.654140 |
| 5 | 0.973552 | 0.745344 | 0.494291 | 0.655681 |
| 6 | 0.971236 | 0.744748 | 0.493677 | 0.654615 |
| 7 | 0.974001 | 0.745474 | 0.494425 | 0.655900 |
| 8 | 0.974972 | 0.750014 | 0.499044 | 0.660175 |
| 9 | 0.972731 | 0.743929 | 0.492864 | 0.654238 |
In [69]:
# K- Fold Cross Validation for Testing Data 1
k_fold_cross_valscore(model_name =xgb_final,
x_train_data = X_test1,
y_train_data = y_test1,
folds = 10)
The mean recall for the model after 10 folds is 0.973877888358367 The mean accuracy for model after 10 folds is 0.7497133795872017 the mean precision for the model after 10 folds is 0.49971810702330277 the mean f1 score for the model after 10 folds is 0.6604991904791444
Out[69]:
| Recall | Accuracy | Precision | F1 Score | |
|---|---|---|---|---|
| 0 | 0.971171 | 0.756187 | 0.506430 | 0.665714 |
| 1 | 0.974644 | 0.753234 | 0.503318 | 0.663828 |
| 2 | 0.977075 | 0.750369 | 0.500356 | 0.661804 |
| 3 | 0.979507 | 0.747417 | 0.497354 | 0.659726 |
| 4 | 0.967338 | 0.752345 | 0.502346 | 0.661283 |
| 5 | 0.977067 | 0.754342 | 0.504395 | 0.665326 |
| 6 | 0.974297 | 0.750782 | 0.500803 | 0.661557 |
| 7 | 0.967697 | 0.742185 | 0.492052 | 0.652383 |
| 8 | 0.974297 | 0.745832 | 0.495758 | 0.657140 |
| 9 | 0.975686 | 0.744443 | 0.494368 | 0.656232 |
In [70]:
# K- Fold Cross Validation for Testing Data 2
k_fold_cross_valscore(model_name =xgb_final,
x_train_data = X_test2,
y_train_data = y_test2,
folds = 10)
The mean recall for the model after 10 folds is 0.9737643441714846 The mean accuracy for model after 10 folds is 0.7449224839201262 the mean precision for the model after 10 folds is 0.4957565763441945 the mean f1 score for the model after 10 folds is 0.6570094538950928
Out[70]:
| Recall | Accuracy | Precision | F1 Score | |
|---|---|---|---|---|
| 0 | 0.976116 | 0.745854 | 0.496654 | 0.658340 |
| 1 | 0.973001 | 0.748025 | 0.498846 | 0.659550 |
| 2 | 0.975078 | 0.746375 | 0.497176 | 0.658562 |
| 3 | 0.972318 | 0.740644 | 0.491517 | 0.652957 |
| 4 | 0.975779 | 0.745854 | 0.496741 | 0.658340 |
| 5 | 0.968155 | 0.746092 | 0.496891 | 0.656727 |
| 6 | 0.975770 | 0.738625 | 0.489495 | 0.651943 |
| 7 | 0.973001 | 0.749218 | 0.500089 | 0.660635 |
| 8 | 0.976116 | 0.743314 | 0.494130 | 0.656119 |
| 9 | 0.972309 | 0.745224 | 0.496027 | 0.656922 |
Shap Analysis for XGBoost Model¶
In [110]:
import shap
# bee swarm plot
explainer = shap.Explainer(xgb_final)
shap_values = explainer(X_test2)
shap.plots.beeswarm(shap_values)
In [111]:
# waterfall plot
shap.plots.waterfall(shap_values[150])
Score Bins for Best XGBoost Model¶
In [73]:
# 1. Show the parameters of the final model
print("Parameters of the final model:", xgb_final.get_params())
print('\n')
# 2. Calculate and display AUC on each sample
y_train_pred = xgb_final.predict_proba(X_train)[:, 1]
y_test1_pred = xgb_final.predict_proba(X_test1)[:, 1]
y_test2_pred = xgb_final.predict_proba(X_test2)[:, 1]
auc_train = roc_auc_score(y_train, y_train_pred)
auc_test1 = roc_auc_score(y_test1, y_test1_pred)
auc_test2 = roc_auc_score(y_test2, y_test2_pred)
print(f"AUC on Train: {auc_train}")
print(f"AUC on Test 1: {auc_test1}")
print(f"AUC on Test 2: {auc_test2}")
# 3. Define score bins based on the train sample
train_scores = xgb_final.predict_proba(X_train)[:, 1]
bins = np.percentile(train_scores, [0, 25, 50, 75, 100])
# 4. Apply the same thresholds to test samples and calculate default rates
def calculate_default_rate(y_true, y_pred, bins):
indices = np.digitize(y_pred, bins) - 1 # Find bin index for each prediction
default_rate = [np.mean(y_true[indices == i]) for i in range(len(bins)-1)]
return default_rate
default_rates_train = calculate_default_rate(y_train, y_train_pred, bins)
default_rates_test1 = calculate_default_rate(y_test1, y_test1_pred, bins)
default_rates_test2 = calculate_default_rate(y_test2, y_test2_pred, bins)
color_train = 'red' # Red color
color_test1 = '#00CED1' # Turquoise color (HEX code)
color_test2 = 'purple' # Purple color
# 5. Show rank orderings in a Bar-Chart
width = 0.25 # Width of the bars
x_indexes = np.arange(len(bins)-1)
plt.figure(figsize=(12, 6))
plt.bar(x_indexes - width, default_rates_train, width=width, label='Train', color=color_train)
plt.bar(x_indexes, default_rates_test1, width=width, label='Test 1', color=color_test1)
plt.bar(x_indexes + width, default_rates_test2, width=width, label='Test 2', color=color_test2)
# Formatting the plot
plt.xlabel('Score Bins')
plt.ylabel('Default Rate')
plt.title('Rank Orderings by Score Bins')
plt.xticks(x_indexes, [f"{bins[i]:.2f}-{bins[i+1]:.2f}" for i in range(len(bins)-1)])
plt.legend()
plt.grid(True)
plt.show()
Parameters of the final model: {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.5, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.01, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 50, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': 5, 'subsample': 0.5, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}
AUC on Train: 0.9267799025399948
AUC on Test 1: 0.9267157509578828
AUC on Test 2: 0.9252101702245537
In [ ]:
In [ ]:
Outlier Analysis¶
In [74]:
df4 = df3.copy()
In [ ]:
In [75]:
print(features)
['D_52', 'B_Ave_3_months', 'D_112', 'B_38_2.0', 'B_2', 'R_Ave', 'R_Ave_12_months', 'S_7', 'R_1', 'B_38_4.0', 'R_26', 'B_7', 'D_41', 'P_2', 'D_129', 'D_132', 'B_3', 'D_49', 'D_45', 'D_44', 'B_1', 'D_46', 'B_37', 'P_Total', 'B_Total', 'D_51', 'S_Ave_6_months', 'D_75', 'R_27', 'B_9', 'B_Ave_6_months', 'D_79', 'D_48', 'S_23', 'D_43', 'B_4', 'B_10', 'B_Ave', 'R_Total', 'S_3', 'D_42']
In [77]:
num_plots = len(features)
num_rows = (num_plots + 1) // 2 # to make sure we have enough rows for odd number of variables
num_cols = 2
# create subplots with the above calculated num_rows and num_cols
fig, axes = plt.subplots(num_rows, num_cols, figsize=(12,80))
axes = axes.flatten()
# iterate over the columns and create boxplots
for i, column in enumerate(features):
sns.boxplot(x = column, data=df4, ax=axes[i]) #orient='v')
#sns.despine(offset = 10, trim=True, ax=axes[i])
# hide any empty subplots
# for j in range(len(num), len(axes)):
# fig.delaxes(axes[j])
plt.tight_layout()
plt.show()
In [78]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Step 1: Cap and floor observations at 1st and 99th percentiles (Train sample)
lower_percentile = df4.drop(["customer_ID", "S_2", "target"], axis=1).quantile(0.01)
upper_percentile = df4.drop(["customer_ID", "S_2", "target"], axis=1).quantile(0.99)
df4_out_removed = df4.drop(["customer_ID", "S_2", "target"], axis=1).clip(lower=lower_percentile,
upper=upper_percentile,
axis=1)
# Step 3: Replace missing values with 0
df4_out_removed = df4_out_removed.fillna(0)
# Step 2: Standardize (normalize) the data based on Mean and Standard Deviation from Train sample
scaler = StandardScaler()
# Fit the scaler on the Train sample data
scaler.fit(df4_out_removed)
# Transform the Train, Test1, and Test2 samples
X = df4_out_removed # Use the capped and standardized data
y = df4['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
# Apply the same scaling to all data subsets
X_train = scaler.transform(X_train)
X_test1 = scaler.transform(X_test1)
X_test2 = scaler.transform(X_test2)
In [ ]:
Neural Network¶
In [79]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
2024-04-06 13:42:32.857711: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used. 2024-04-06 13:42:32.860904: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used. 2024-04-06 13:42:32.894045: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 2024-04-06 13:42:33.721393: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
In [80]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import pandas as pd
from sklearn.metrics import roc_auc_score
# Assuming your datasets are defined: X_train, y_train, X_test1, y_test1, X_test2, y_test2
def build_model(hidden_layers, nodes, activation, dropout, input_shape):
model = Sequential()
model.add(tf.keras.Input(shape=(input_shape,))) # Adjust based on your dataset
for _ in range(hidden_layers):
model.add(Dense(nodes, activation=activation))
if dropout < 1.0:
model.add(Dropout(dropout))
model.add(Dense(1, activation='sigmoid')) # Assuming binary classification
return model
results_list = [] # List to hold all results before creating DataFrame
parameter_grid = [
(hl, n, af, d, bs)
for hl in [2, 4]
for n in [4, 6]
for af in ['relu', 'tanh']
for d in [0.5, 0.0] # 0.0 represents no dropout
for bs in [100, 10000]
]
input_shape = X_train.shape[1] # Ensure this matches your dataset
for hl, n, af, d, bs in parameter_grid:
model = build_model(hl, n, af, d, input_shape)
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC(name='auc')])
# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=bs, verbose=0)
# Evaluate model performance using roc_auc_score
auc_train = roc_auc_score(y_train, model.predict(X_train).ravel())
auc_test_1 = roc_auc_score(y_test1, model.predict(X_test1).ravel())
auc_test_2 = roc_auc_score(y_test2, model.predict(X_test2).ravel())
# Collect results
results_list.append({
'# HL': hl,
'# Node': n,
'Activation Function': af,
'Dropout': d,
'Batch Size': bs,
'AUC Train': auc_train,
'AUC Test 1': auc_test_1,
'AUC Test 2': auc_test_2
})
# Convert the list of dictionaries to a DataFrame
results_df_nn = pd.DataFrame(results_list)
# Save the DataFrame to a CSV file
results_df_nn.to_csv('grid_search_results_nn.csv', index=False)
print("Grid search completed and results saved.")
2024-04-06 13:42:36.684163: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355 2024-04-06 13:42:36.685587: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform. Skipping registering GPU devices...
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 582us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 559us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 570us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 553us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 590us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 557us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 514us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 533us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 571us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 8s 499us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 564us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 627us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 533us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 551us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 518us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 550us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 592us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 561us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 547us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 571us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 557us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 560us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 539us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 560us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 567us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 625us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 625us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 559us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 604us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 627us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 555us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 580us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 598us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 617us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 627us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 608us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 576us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 566us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 584us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 9s 550us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 584us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 615us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 582us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 613us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 591us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 611us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 644us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 597us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 607us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 669us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 624us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 575us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 618us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 622us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 602us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 607us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 648us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 585us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 612us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 659us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 604us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 625us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 635us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 606us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 664us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 654us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 597us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 596us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 576us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 588us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 627us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 583us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 597us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 631us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 641us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 600us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 614us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 610us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 11s 641us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 619us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 610us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 615us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 640us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 615us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 615us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 613us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 585us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 612us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 615us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 622us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 11s 648us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 621us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 679us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 594us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 635us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 637us/step Grid search completed and results saved.
In [81]:
results_df_nn
Out[81]:
| # HL | # Node | Activation Function | Dropout | Batch Size | AUC Train | AUC Test 1 | AUC Test 2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 4 | relu | 0.5 | 100 | 0.909950 | 0.910248 | 0.909084 |
| 1 | 2 | 4 | relu | 0.5 | 10000 | 0.918418 | 0.919725 | 0.917964 |
| 2 | 2 | 4 | relu | 0.0 | 100 | 0.933205 | 0.934030 | 0.932357 |
| 3 | 2 | 4 | relu | 0.0 | 10000 | 0.927684 | 0.928676 | 0.927315 |
| 4 | 2 | 4 | tanh | 0.5 | 100 | 0.930401 | 0.931599 | 0.929836 |
| 5 | 2 | 4 | tanh | 0.5 | 10000 | 0.930554 | 0.931703 | 0.930125 |
| 6 | 2 | 4 | tanh | 0.0 | 100 | 0.933622 | 0.934339 | 0.932977 |
| 7 | 2 | 4 | tanh | 0.0 | 10000 | 0.930719 | 0.931448 | 0.929854 |
| 8 | 2 | 6 | relu | 0.5 | 100 | 0.931800 | 0.933030 | 0.931301 |
| 9 | 2 | 6 | relu | 0.5 | 10000 | 0.929381 | 0.930490 | 0.929016 |
| 10 | 2 | 6 | relu | 0.0 | 100 | 0.934423 | 0.935323 | 0.933645 |
| 11 | 2 | 6 | relu | 0.0 | 10000 | 0.931264 | 0.932065 | 0.930585 |
| 12 | 2 | 6 | tanh | 0.5 | 100 | 0.929642 | 0.930880 | 0.929157 |
| 13 | 2 | 6 | tanh | 0.5 | 10000 | 0.930797 | 0.932000 | 0.930362 |
| 14 | 2 | 6 | tanh | 0.0 | 100 | 0.934677 | 0.935297 | 0.933549 |
| 15 | 2 | 6 | tanh | 0.0 | 10000 | 0.930554 | 0.931508 | 0.929754 |
| 16 | 4 | 4 | relu | 0.5 | 100 | 0.929322 | 0.930723 | 0.928703 |
| 17 | 4 | 4 | relu | 0.5 | 10000 | 0.891645 | 0.892041 | 0.891669 |
| 18 | 4 | 4 | relu | 0.0 | 100 | 0.933497 | 0.934424 | 0.932836 |
| 19 | 4 | 4 | relu | 0.0 | 10000 | 0.931591 | 0.932588 | 0.930949 |
| 20 | 4 | 4 | tanh | 0.5 | 100 | 0.929506 | 0.930680 | 0.929208 |
| 21 | 4 | 4 | tanh | 0.5 | 10000 | 0.929591 | 0.930628 | 0.929005 |
| 22 | 4 | 4 | tanh | 0.0 | 100 | 0.933546 | 0.934173 | 0.932892 |
| 23 | 4 | 4 | tanh | 0.0 | 10000 | 0.930216 | 0.931005 | 0.929529 |
| 24 | 4 | 6 | relu | 0.5 | 100 | 0.930308 | 0.931446 | 0.929894 |
| 25 | 4 | 6 | relu | 0.5 | 10000 | 0.910467 | 0.910680 | 0.909214 |
| 26 | 4 | 6 | relu | 0.0 | 100 | 0.934501 | 0.935337 | 0.933631 |
| 27 | 4 | 6 | relu | 0.0 | 10000 | 0.927674 | 0.928096 | 0.926558 |
| 28 | 4 | 6 | tanh | 0.5 | 100 | 0.929873 | 0.930968 | 0.929383 |
| 29 | 4 | 6 | tanh | 0.5 | 10000 | 0.930363 | 0.931508 | 0.929952 |
| 30 | 4 | 6 | tanh | 0.0 | 100 | 0.934569 | 0.935102 | 0.933525 |
| 31 | 4 | 6 | tanh | 0.0 | 10000 | 0.931858 | 0.932815 | 0.931126 |
In [82]:
# calculate the mean auc
results_df_nn['Average AUC'] = results_df_nn[['AUC Train', 'AUC Test 1', 'AUC Test 2']].mean(axis=1)
# calculate the std for auc
results_df_nn['Std AUC'] = results_df_nn[['AUC Train', 'AUC Test 1', 'AUC Test 2']].std(axis=1)
results_df_nn
# results_df_xgb.iloc[:, 5:]
Out[82]:
| # HL | # Node | Activation Function | Dropout | Batch Size | AUC Train | AUC Test 1 | AUC Test 2 | Average AUC | Std AUC | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 4 | relu | 0.5 | 100 | 0.909950 | 0.910248 | 0.909084 | 0.909760 | 0.000605 |
| 1 | 2 | 4 | relu | 0.5 | 10000 | 0.918418 | 0.919725 | 0.917964 | 0.918703 | 0.000914 |
| 2 | 2 | 4 | relu | 0.0 | 100 | 0.933205 | 0.934030 | 0.932357 | 0.933197 | 0.000837 |
| 3 | 2 | 4 | relu | 0.0 | 10000 | 0.927684 | 0.928676 | 0.927315 | 0.927892 | 0.000704 |
| 4 | 2 | 4 | tanh | 0.5 | 100 | 0.930401 | 0.931599 | 0.929836 | 0.930612 | 0.000900 |
| 5 | 2 | 4 | tanh | 0.5 | 10000 | 0.930554 | 0.931703 | 0.930125 | 0.930794 | 0.000816 |
| 6 | 2 | 4 | tanh | 0.0 | 100 | 0.933622 | 0.934339 | 0.932977 | 0.933646 | 0.000682 |
| 7 | 2 | 4 | tanh | 0.0 | 10000 | 0.930719 | 0.931448 | 0.929854 | 0.930674 | 0.000798 |
| 8 | 2 | 6 | relu | 0.5 | 100 | 0.931800 | 0.933030 | 0.931301 | 0.932044 | 0.000890 |
| 9 | 2 | 6 | relu | 0.5 | 10000 | 0.929381 | 0.930490 | 0.929016 | 0.929629 | 0.000767 |
| 10 | 2 | 6 | relu | 0.0 | 100 | 0.934423 | 0.935323 | 0.933645 | 0.934464 | 0.000839 |
| 11 | 2 | 6 | relu | 0.0 | 10000 | 0.931264 | 0.932065 | 0.930585 | 0.931305 | 0.000740 |
| 12 | 2 | 6 | tanh | 0.5 | 100 | 0.929642 | 0.930880 | 0.929157 | 0.929893 | 0.000888 |
| 13 | 2 | 6 | tanh | 0.5 | 10000 | 0.930797 | 0.932000 | 0.930362 | 0.931053 | 0.000848 |
| 14 | 2 | 6 | tanh | 0.0 | 100 | 0.934677 | 0.935297 | 0.933549 | 0.934508 | 0.000886 |
| 15 | 2 | 6 | tanh | 0.0 | 10000 | 0.930554 | 0.931508 | 0.929754 | 0.930605 | 0.000879 |
| 16 | 4 | 4 | relu | 0.5 | 100 | 0.929322 | 0.930723 | 0.928703 | 0.929583 | 0.001035 |
| 17 | 4 | 4 | relu | 0.5 | 10000 | 0.891645 | 0.892041 | 0.891669 | 0.891785 | 0.000222 |
| 18 | 4 | 4 | relu | 0.0 | 100 | 0.933497 | 0.934424 | 0.932836 | 0.933585 | 0.000798 |
| 19 | 4 | 4 | relu | 0.0 | 10000 | 0.931591 | 0.932588 | 0.930949 | 0.931710 | 0.000826 |
| 20 | 4 | 4 | tanh | 0.5 | 100 | 0.929506 | 0.930680 | 0.929208 | 0.929798 | 0.000778 |
| 21 | 4 | 4 | tanh | 0.5 | 10000 | 0.929591 | 0.930628 | 0.929005 | 0.929741 | 0.000822 |
| 22 | 4 | 4 | tanh | 0.0 | 100 | 0.933546 | 0.934173 | 0.932892 | 0.933537 | 0.000640 |
| 23 | 4 | 4 | tanh | 0.0 | 10000 | 0.930216 | 0.931005 | 0.929529 | 0.930250 | 0.000739 |
| 24 | 4 | 6 | relu | 0.5 | 100 | 0.930308 | 0.931446 | 0.929894 | 0.930549 | 0.000804 |
| 25 | 4 | 6 | relu | 0.5 | 10000 | 0.910467 | 0.910680 | 0.909214 | 0.910120 | 0.000793 |
| 26 | 4 | 6 | relu | 0.0 | 100 | 0.934501 | 0.935337 | 0.933631 | 0.934490 | 0.000853 |
| 27 | 4 | 6 | relu | 0.0 | 10000 | 0.927674 | 0.928096 | 0.926558 | 0.927443 | 0.000794 |
| 28 | 4 | 6 | tanh | 0.5 | 100 | 0.929873 | 0.930968 | 0.929383 | 0.930075 | 0.000811 |
| 29 | 4 | 6 | tanh | 0.5 | 10000 | 0.930363 | 0.931508 | 0.929952 | 0.930608 | 0.000806 |
| 30 | 4 | 6 | tanh | 0.0 | 100 | 0.934569 | 0.935102 | 0.933525 | 0.934399 | 0.000802 |
| 31 | 4 | 6 | tanh | 0.0 | 10000 | 0.931858 | 0.932815 | 0.931126 | 0.931933 | 0.000847 |
In [83]:
plt.figure(figsize=(10, 5))
plt.scatter(results_df_nn['Average AUC'], results_df_nn['Std AUC'])
xlab = 'Average AUC Scores for Neural Net Model'
ylab = 'Standard Deviation of AUC Scores'
title = 'XGBoost Model Average AUC vs Std'
plt.xlabel(xlab)
plt.ylabel(ylab)
plt.title(title)
plt.show()
In [84]:
plt.figure(figsize=(10, 5))
plt.scatter(results_df_nn['AUC Train'], results_df_nn['AUC Test 2'])
xlab = 'AUC of Train sample for Neural Net Model'
ylab = 'AUC of Test 2'
title = 'Train AUC vs Test_2 AUC'
plt.xlabel(xlab)
plt.ylabel(ylab)
plt.title(title)
plt.show()
Best Neural Network based on Bias-Variance¶
In [85]:
# Analyze the results to get the optimal parameters
# Load the results
# results_df_nn = pd.read_csv('grid_search_results_nn.csv')
results_df_nn
# Find the row with the highest test AUC and smallest difference between train and test AUC
results_df_nn['AUC Diff'] = abs(results_df_nn['AUC Train'] - results_df_nn[['AUC Test 1', 'AUC Test 2']].mean(axis=1))
best_model_nn = results_df_nn.loc[(results_df_nn[['AUC Test 1', 'AUC Test 2']].mean(axis=1) - results_df_nn['AUC Diff']).idxmax()]
print("Optimal Parameters:")
print(best_model_nn)
Optimal Parameters: # HL 4 # Node 6 Activation Function relu Dropout 0.0 Batch Size 100 AUC Train 0.934501 AUC Test 1 0.935337 AUC Test 2 0.933631 Average AUC 0.93449 Std AUC 0.000853 AUC Diff 0.000016 Name: 26, dtype: object
In [86]:
# Extracting parameters from best_model_nn
hl = int(best_model_nn['# HL']) # Number of hidden layers
n = int(best_model_nn['# Node']) # Nodes per layer
af = best_model_nn['Activation Function'] # Activation function
d = float(best_model_nn['Dropout']) # Dropout rate
bs = int(best_model_nn['Batch Size']) # Batch size
# Define the final model (nn_final) with optimum parameters
nn_final = Sequential()
nn_final.add(tf.keras.Input(shape=(X_train.shape[1],))) # Use the actual input shape of your dataset
for _ in range(hl):
nn_final.add(Dense(n, activation=af))
if d < 1.0:
nn_final.add(Dropout(d))
nn_final.add(Dense(1, activation='sigmoid')) # Assuming a binary classification task
nn_final.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC(name='auc')])
# Train the model (nn_final) with optimal parameters
nn_final.fit(X_train, y_train, epochs=20, batch_size=bs, verbose=1)
Epoch 1/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - auc: 0.8941 - loss: 0.3413 Epoch 2/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9315 - loss: 0.2839 Epoch 3/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9327 - loss: 0.2804 Epoch 4/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9332 - loss: 0.2794 Epoch 5/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9330 - loss: 0.2801 Epoch 6/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9330 - loss: 0.2801 Epoch 7/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9334 - loss: 0.2791 Epoch 8/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9331 - loss: 0.2797 Epoch 9/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9333 - loss: 0.2796 Epoch 10/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9329 - loss: 0.2803 Epoch 11/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9340 - loss: 0.2780 Epoch 12/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9331 - loss: 0.2802 Epoch 13/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9340 - loss: 0.2777 Epoch 14/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9328 - loss: 0.2803 Epoch 15/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9333 - loss: 0.2791 Epoch 16/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9342 - loss: 0.2780 Epoch 17/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9334 - loss: 0.2789 Epoch 18/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9341 - loss: 0.2775 Epoch 19/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9334 - loss: 0.2788 Epoch 20/20 5375/5375 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - auc: 0.9340 - loss: 0.2781
Out[86]:
<keras.src.callbacks.history.History at 0x78e78c796970>
In [87]:
print(nn_final)
<Sequential name=sequential_32, built=True>
In [88]:
# Evaluate the model
auc_train = roc_auc_score(y_train, nn_final.predict(X_train).ravel())
auc_test_1 = roc_auc_score(y_test1, nn_final.predict(X_test1).ravel())
print(f"Retrained Model AUC: Train={auc_train}, Test1={auc_test_1}")
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 605us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 622us/step Retrained Model AUC: Train=0.9340498172065264, Test1=0.9346269463697459
In [89]:
from sklearn.metrics import classification_report, confusion_matrix
def model_performance(model, X_train, y_train, X_test1, y_test1, X_test2, y_test2):
# Getting predictions
train_preds = model.predict(X_train).ravel()
test_preds1 = model.predict(X_test1).ravel()
test_preds2 = model.predict(X_test2).ravel()
# Converting probabilities to class labels based on 0.5 threshold
train_preds = np.where(train_preds > 0.5, 1, 0)
test_preds1 = np.where(test_preds1 > 0.5, 1, 0)
test_preds2 = np.where(test_preds2 > 0.5, 1, 0)
# Classification reports
print('Classification report for training data')
print(classification_report(y_train, train_preds))
print('Classification report for testing data 1')
print(classification_report(y_test1, test_preds1))
print('Classification report for testing data 2')
print(classification_report(y_test2, test_preds2))
# Plotting confusion matrices
fig, axes = plt.subplots(1, 3, figsize=(20, 5))
sns.heatmap(confusion_matrix(y_train, train_preds), annot=True, cbar=False, fmt='d', ax=axes[0])
axes[0].set_xlabel('Predicted labels')
axes[0].set_ylabel('Actual labels')
axes[0].set_title('Confusion Matrix for Training Data')
sns.heatmap(confusion_matrix(y_test1, test_preds1), annot=True, cbar=False, fmt='d', ax=axes[1])
axes[1].set_xlabel('Predicted labels')
axes[1].set_ylabel('Actual labels')
axes[1].set_title('Confusion Matrix for Testing Data 1')
sns.heatmap(confusion_matrix(y_test2, test_preds2), annot=True, cbar=False, fmt='d', ax=axes[2])
axes[2].set_xlabel('Predicted labels')
axes[2].set_ylabel('Actual labels')
axes[2].set_title('Confusion Matrix for Testing Data 2')
plt.tight_layout()
plt.show()
In [90]:
model_performance(nn_final, X_train, y_train, X_test1, y_test1, X_test2, y_test2)
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 598us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 633us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 665us/step Classification report for training data precision recall f1-score support 0 0.92 0.91 0.91 403583 1 0.74 0.76 0.75 133850 accuracy 0.87 537433 macro avg 0.83 0.84 0.83 537433 weighted avg 0.87 0.87 0.87 537433 Classification report for testing data 1 precision recall f1-score support 0 0.92 0.91 0.91 86376 1 0.74 0.76 0.75 28788 accuracy 0.87 115164 macro avg 0.83 0.84 0.83 115164 weighted avg 0.87 0.87 0.87 115164 Classification report for testing data 2 precision recall f1-score support 0 0.92 0.91 0.91 86273 1 0.73 0.76 0.75 28892 accuracy 0.87 115165 macro avg 0.83 0.83 0.83 115165 weighted avg 0.87 0.87 0.87 115165
In [91]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
def roc_score_auc_curve_nn(model, X_train, y_train, X_test1, y_test1, X_test2, y_test2):
# Getting the predicted probabilities
train_preds = model.predict(X_train).ravel()
test_preds1 = model.predict(X_test1).ravel()
test_preds2 = model.predict(X_test2).ravel()
# Calculating ROC AUC scores
a = round(roc_auc_score(y_train, train_preds), 4)
b = round(roc_auc_score(y_test1, test_preds1), 4)
c = round(roc_auc_score(y_test2, test_preds2), 4)
print('AUC Score for Model on Training Data is', a)
print('AUC Score for Model on Testing Data 1 is', b)
print('AUC Score for Model on Testing Data 2 is', c)
# Plotting ROC curves
plt.figure(figsize=(12, 7))
# Training Data
train_fpr, train_tpr, _ = roc_curve(y_train, train_preds)
plt.plot(train_fpr, train_tpr, label=f'Train AUC: {a}', marker='.')
# Test Data 1
test_fpr, test_tpr, _ = roc_curve(y_test1, test_preds1)
plt.plot(test_fpr, test_tpr, label=f'Test 1 AUC: {b}', marker='o')
# Test Data 2
test_fpr2, test_tpr2, _ = roc_curve(y_test2, test_preds2)
plt.plot(test_fpr2, test_tpr2, label=f'Test 2 AUC: {c}', marker='x')
# Line of No Discrimination
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='best')
plt.show()
In [92]:
roc_score_auc_curve_nn(nn_final, X_train, y_train, X_test1, y_test1, X_test2, y_test2)
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 11s 642us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 611us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 625us/step AUC Score for Model on Training Data is 0.934 AUC Score for Model on Testing Data 1 is 0.9346 AUC Score for Model on Testing Data 2 is 0.9329
Score Bins for Neural Nets¶
In [93]:
# Calculate AUC on each sample
y_train_pred = nn_final.predict(X_train)
y_test1_pred = nn_final.predict(X_test1)
y_test2_pred = nn_final.predict(X_test2)
auc_train = roc_auc_score(y_train, y_train_pred)
auc_test1 = roc_auc_score(y_test1, y_test1_pred)
auc_test2 = roc_auc_score(y_test2, y_test2_pred)
print(f"AUC on Train: {auc_train}")
print(f"AUC on Test 1: {auc_test1}")
print(f"AUC on Test 2: {auc_test2}")
# Define score bins based on the train sample
train_scores = y_train_pred.reshape(-1)
bins = np.percentile(train_scores, [0, 25, 50, 75, 100])
# Calculate the value ranges for each bin
bin_ranges = [f"{bins[i]:.2f}-{bins[i+1]:.2f}" for i in range(len(bins) - 1)]
# Apply the same thresholds to test samples
test1_scores = y_test1_pred.reshape(-1)
test2_scores = y_test2_pred.reshape(-1)
# Calculate default rates in each bin for each sample
def calculate_default_rate(scores, y):
default_rates = []
for i in range(len(bins) - 1):
mask = (scores >= bins[i]) & (scores <= bins[i + 1])
if np.sum(mask) > 0:
default_rate = np.sum(y[mask]) / np.sum(mask)
default_rates.append(default_rate)
else:
default_rates.append(0.0)
return default_rates
default_rates_train = calculate_default_rate(train_scores, y_train)
default_rates_test1 = calculate_default_rate(test1_scores, y_test1)
default_rates_test2 = calculate_default_rate(test2_scores, y_test2)
x_indexes = np.arange(len(bin_ranges))
width = 0.25
plt.figure(figsize=(12, 6))
plt.bar(x_indexes - width, default_rates_train, width=width, label='Train', align='center')
plt.bar(x_indexes, default_rates_test1, width=width, label='Test 1', align='center')
plt.bar(x_indexes + width, default_rates_test2, width=width, label='Test 2', align='center')
plt.xlabel('Score Bins')
plt.ylabel('Default Rate')
plt.title('Rank Orderings by Score Bins')
plt.xticks(x_indexes, bin_ranges, rotation=45) # Display bin ranges on the x-axis
plt.legend()
plt.grid(True)
plt.show()
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 11s 650us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 3s 743us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 669us/step AUC on Train: 0.9340498172065264 AUC on Test 1: 0.9346269463697459 AUC on Test 2: 0.9329086688709763
In [ ]:
Choosing Between NN and XGB¶
In [94]:
best_model_nn
Out[94]:
# HL 4 # Node 6 Activation Function relu Dropout 0.0 Batch Size 100 AUC Train 0.934501 AUC Test 1 0.935337 AUC Test 2 0.933631 Average AUC 0.93449 Std AUC 0.000853 AUC Diff 0.000016 Name: 26, dtype: object
In [96]:
best_model_xgb
Out[96]:
n_estimators 50 learning_rate 0.01 Subsample % 50.0% Features 50.0% % Weight of Default 5 AUC Train 0.926768 AUC Test 1 0.926775 AUC Test 2 0.925061 Average AUC 0.926201 Std AUC 0.000988 AUC Diff 0.000007 Name: 2, dtype: object
In [97]:
# Calculate average AUC scores directly within a dictionary for easier comparison
avg_aucs = {
"XGB Model": (best_model_xgb['AUC Test 1'] + best_model_xgb['AUC Test 2']) / 2,
"NN Model": (best_model_nn['AUC Test 1'] + best_model_nn['AUC Test 2']) / 2
}
# Print the average AUC values for both models
print(f"Avg AUC value of XGB Model: {avg_aucs['XGB Model']:.5f}")
print(f"Avg AUC value of NN Model: {avg_aucs['NN Model']:.5f}")
# Identify the best model based on the highest average AUC score
best_model, best_avg_auc = max(avg_aucs.items(), key=lambda item: item[1])
# Print the result stating the best model and its average AUC value
print(f"The best model is {best_model} with Avg AUC value of: {best_avg_auc:.5f}")
Avg AUC value of XGB Model: 0.92592 Avg AUC value of NN Model: 0.93448 The best model is NN Model with Avg AUC value of: 0.93448
In [ ]:
Exporting the Neural Network Model and XGBoost Model¶
In [98]:
xgb_final.save_model('best_xgb_model.json')
In [99]:
best_nn_model = nn_final
best_nn_model
best_nn_model.save('best_nn_model.keras')
In [ ]:
In [ ]:
Strategy¶
In [100]:
X = df3.drop(['customer_ID','S_2','target'], axis=1)
y = df3['target']
X_train, X_test1, y_train, y_test1 = train_test_split(X, y, test_size=0.3, random_state=42)
X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test1, y_test1, test_size=0.5, random_state=42)
In [102]:
# Get predictions for the train dataset
nn_predictions_train = (nn_final.predict(X_train) > 0.5).astype("int32")
# Get predictions for the test1 dataset
nn_predictions_test1 = (nn_final.predict(X_test1) > 0.5).astype("int32")
# Get predictions for the test2 dataset
nn_predictions_test2 = (nn_final.predict(X_test2) > 0.5).astype("int32")
# Get predicted probabilities for the train dataset
nn_probs_train = nn_final.predict(X_train)
# Get predicted probabilities for the test1 dataset
nn_probs_test1 = nn_final.predict(X_test1)
# Get predicted probabilities for the test2 dataset
nn_probs_test2 = nn_final.predict(X_test2)
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 12s 704us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 681us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 3s 720us/step 16795/16795 ━━━━━━━━━━━━━━━━━━━━ 10s 615us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 671us/step 3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 614us/step
Training Data¶
In [108]:
# Define the thresholds
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# Calculate the average spend and balance for the last 6 months (November 2017 to April 2018)
start_date = pd.to_datetime("2017-11-01")
end_date = pd.to_datetime("2018-04-30")
date_feature_train = pd.to_datetime(df3['S_2']) # Convert to datetime if not already
filtered_data_train = df3[(date_feature_train >= start_date) & (date_feature_train <= end_date)]
average_spend_train = filtered_data_train['S_3'].mean()
average_balance_train = filtered_data_train['B_10'].mean()
# Calculate the monthly revenue for 1 customer
monthly_revenue_train = average_balance_train * 0.02 + average_spend_train * 0.001
# Calculate the expected annual revenue over the next 12 months
expected_revenue_train = monthly_revenue_train * 12
# Assuming your neural network model is fitted and ready to make predictions
# Get predicted probabilities for the train dataset using your TensorFlow model
# Flatten the output if it's not already flat
nn_probs_train_flat = nn_final.predict(X_train).flatten()
# Initialize an empty list to store dictionaries of the results
results = []
for threshold in thresholds:
# Determine accepted applicants based on the threshold
accepted_indices_train = nn_probs_train_flat < threshold
total_applicants_train = np.sum(accepted_indices_train)
defaulted_applicants_train = np.sum(y_train[accepted_indices_train])
# Avoid division by zero
default_rate_train = defaulted_applicants_train / total_applicants_train if total_applicants_train > 0 else 0
# Calculate the portfolio revenue
portfolio_revenue_value_train = expected_revenue_train * total_applicants_train
# Append a dictionary of results for this threshold
results.append({
'Threshold': threshold,
'Default Rate': default_rate_train,
'Portfolio Revenue': portfolio_revenue_value_train,
'Applicant Count': total_applicants_train
})
# Convert the list of dictionaries to a DataFrame
results_df = pd.DataFrame(results)
# Display the DataFrame
results_df
16795/16795 ━━━━━━━━━━━━━━━━━━━━ 11s 655us/step
Out[108]:
| Threshold | Default Rate | Portfolio Revenue | Applicant Count | |
|---|---|---|---|---|
| 0 | 0.1 | 0.216867 | 9.538479 | 166 |
| 1 | 0.2 | 0.110650 | 41.544097 | 723 |
| 2 | 0.3 | 0.010962 | 1493.863657 | 25998 |
| 3 | 0.4 | 0.038721 | 4359.889176 | 75876 |
| 4 | 0.5 | 0.112524 | 5792.844475 | 100814 |
| 5 | 0.6 | 0.201083 | 6897.009566 | 120030 |
| 6 | 0.7 | 0.235884 | 7283.547793 | 126757 |
| 7 | 0.8 | 0.245422 | 7382.610064 | 128481 |
| 8 | 0.9 | 0.249770 | 7428.463715 | 129279 |
| 9 | 1.0 | 0.256160 | 7502.243272 | 130563 |
In [113]:
import plotly.graph_objects as go
def plot_default_rate_and_revenue_vs_threshold(df):
fig = go.Figure()
# Default Rate
fig.add_trace(go.Scatter(x=df['Threshold'], y=df['Default Rate'],
mode='lines+markers',
name='Default Rate'))
# Portfolio Revenue
fig.add_trace(go.Scatter(x=df['Threshold'], y=df['Portfolio Revenue'],
mode='lines+markers',
name='Portfolio Revenue',
yaxis='y2'))
# Create axis objects
fig.update_layout(
title='Default Rate and Portfolio Revenue vs. Threshold',
xaxis=dict(title='Threshold'),
yaxis=dict(title='Default Rate'),
yaxis2=dict(title='Portfolio Revenue', overlaying='y', side='right'),
legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
height=600
)
fig.show()
plot_default_rate_and_revenue_vs_threshold(results_df)
In [118]:
from plotly.subplots import make_subplots
def plot_combined_metrics(df):
# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(go.Scatter(x=df['Threshold'], y=df['Default Rate'], name="Default Rate"), secondary_y=False)
fig.add_trace(go.Scatter(x=df['Threshold'], y=df['Portfolio Revenue'], name="Portfolio Revenue"), secondary_y=True)
fig.add_trace(go.Bar(x=df['Threshold'], y=df['Applicant Count'], name="Applicant Count", marker_color='lightblue'), secondary_y=False)
# Add figure title
fig.update_layout(title_text="Combined Metrics vs. Threshold",
height=600)
# Set x-axis title
fig.update_xaxes(title_text="Threshold")
# Set y-axes titles
fig.update_yaxes(title_text="Default Rate / Applicant Count", secondary_y=False)
fig.update_yaxes(title_text="Portfolio Revenue", secondary_y=True)
fig.show()
plot_combined_metrics(results_df)
In [ ]:
Test 1¶
In [109]:
# Define the thresholds
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# Calculate the average spend and balance for the last 6 months (November 2017 to April 2018)
start_date = pd.to_datetime("2017-11-01")
end_date = pd.to_datetime("2018-04-30")
date_feature_test1 = pd.to_datetime(df3['S_2']) # Convert to datetime if not already
filtered_data_test1 = df3[(date_feature_test1 >= start_date) & (date_feature_test1 <= end_date)]
average_spend_test1 = filtered_data_test1['S_3'].mean()
average_balance_test1 = filtered_data_test1['B_10'].mean()
# Calculate the monthly revenue for 1 customer based on test1 data
monthly_revenue_test1 = average_balance_test1 * 0.02 + average_spend_test1 * 0.001
# Calculate the expected annual revenue over the next 12 months based on test1 data
expected_revenue_test1 = monthly_revenue_test1 * 12
# Get predicted probabilities for the test1 dataset using your TensorFlow model
nn_probs_test1_flat = nn_final.predict(X_test1).flatten()
# Initialize an empty list to store dictionaries of results for test1 data
results_test1 = []
for threshold in thresholds:
# Determine accepted applicants based on the threshold for test1 data
accepted_indices_test1 = nn_probs_test1_flat < threshold
total_applicants_test1 = np.sum(accepted_indices_test1)
defaulted_applicants_test1 = np.sum(y_test1[accepted_indices_test1])
# Avoid division by zero
default_rate_test1 = defaulted_applicants_test1 / total_applicants_test1 if total_applicants_test1 > 0 else 0
# Calculate the portfolio revenue for test1 data
portfolio_revenue_value_test1 = expected_revenue_test1 * total_applicants_test1
# Append a dictionary of results for this threshold for test1 data
results_test1.append({
'Threshold': threshold,
'Default Rate': default_rate_test1,
'Portfolio Revenue': portfolio_revenue_value_test1,
'Applicant Count': total_applicants_test1
})
# Convert the list of dictionaries to a DataFrame for test1 data
results_df_test1 = pd.DataFrame(results_test1)
# Display the DataFrame for test1 data
results_df_test1
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 2s 631us/step
Out[109]:
| Threshold | Default Rate | Portfolio Revenue | Applicant Count | |
|---|---|---|---|---|
| 0 | 0.1 | 0.314286 | 2.011125 | 35 |
| 1 | 0.2 | 0.132911 | 9.078793 | 158 |
| 2 | 0.3 | 0.012256 | 323.503823 | 5630 |
| 3 | 0.4 | 0.038199 | 935.632815 | 16283 |
| 4 | 0.5 | 0.110537 | 1246.552741 | 21694 |
| 5 | 0.6 | 0.198991 | 1481.337221 | 25780 |
| 6 | 0.7 | 0.235149 | 1565.114943 | 27238 |
| 7 | 0.8 | 0.244901 | 1586.088103 | 27603 |
| 8 | 0.9 | 0.249721 | 1596.890718 | 27791 |
| 9 | 1.0 | 0.256587 | 1613.726707 | 28084 |
In [119]:
plot_default_rate_and_revenue_vs_threshold(results_df_test1)
In [120]:
plot_combined_metrics(results_df_test1)
Test 2¶
In [110]:
# Define the thresholds
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# Calculate the average spend and balance for the last 6 months (November 2017 to April 2018)
start_date = pd.to_datetime("2017-11-01")
end_date = pd.to_datetime("2018-04-30")
date_feature_test2 = pd.to_datetime(df3['S_2']) # Convert to datetime if not already
filtered_data_test2 = df3[(date_feature_test2 >= start_date) & (date_feature_test2 <= end_date)]
average_spend_test2 = filtered_data_test2['S_3'].mean()
average_balance_test2 = filtered_data_test2['B_10'].mean()
# Calculate the monthly revenue for 1 customer based on test2 data
monthly_revenue_test2 = average_balance_test2 * 0.02 + average_spend_test2 * 0.001
# Calculate the expected annual revenue over the next 12 months based on test2 data
expected_revenue_test2 = monthly_revenue_test2 * 12
# Get predicted probabilities for the test2 dataset using your TensorFlow model
nn_probs_test2_flat = nn_final.predict(X_test2).flatten()
# Initialize an empty list to store dictionaries of results for test2 data
results_test2 = []
for threshold in thresholds:
# Determine accepted applicants based on the threshold for test2 data
accepted_indices_test2 = nn_probs_test2_flat < threshold
total_applicants_test2 = np.sum(accepted_indices_test2)
defaulted_applicants_test2 = np.sum(y_test2[accepted_indices_test2])
# Avoid division by zero
default_rate_test2 = defaulted_applicants_test2 / total_applicants_test2 if total_applicants_test2 > 0 else 0
# Calculate the portfolio revenue for test2 data
portfolio_revenue_value_test2 = expected_revenue_test2 * total_applicants_test2
# Append a dictionary of results for this threshold for test2 data
results_test2.append({
'Threshold': threshold,
'Default Rate': default_rate_test2,
'Portfolio Revenue': portfolio_revenue_value_test2,
'Applicant Count': total_applicants_test2
})
# Convert the list of dictionaries to a DataFrame for test2 data
results_df_test2 = pd.DataFrame(results_test2)
# Display the DataFrame for test2 data
results_df_test2
3599/3599 ━━━━━━━━━━━━━━━━━━━━ 3s 694us/step
Out[110]:
| Threshold | Default Rate | Portfolio Revenue | Applicant Count | |
|---|---|---|---|---|
| 0 | 0.1 | 0.215686 | 2.930496 | 51 |
| 1 | 0.2 | 0.141935 | 8.906411 | 155 |
| 2 | 0.3 | 0.011758 | 322.526991 | 5613 |
| 3 | 0.4 | 0.041615 | 929.254675 | 16172 |
| 4 | 0.5 | 0.117014 | 1239.427613 | 21570 |
| 5 | 0.6 | 0.206836 | 1480.992456 | 25774 |
| 6 | 0.7 | 0.241552 | 1562.644132 | 27195 |
| 7 | 0.8 | 0.251034 | 1583.962057 | 27566 |
| 8 | 0.9 | 0.255873 | 1594.879593 | 27756 |
| 9 | 1.0 | 0.261905 | 1609.704457 | 28014 |
In [121]:
plot_default_rate_and_revenue_vs_threshold(results_df_test2)
In [122]:
plot_combined_metrics(results_df_test2)
In [ ]: